Diff of /R/outbreaker_data.R [000000] .. [dfe06d]

Switch to side-by-side view

--- a
+++ b/R/outbreaker_data.R
@@ -0,0 +1,263 @@
+#' Process input data for outbreaker
+#'
+#' This function performs various checks on input data given to outbreaker.  It
+#' takes a list of named items as input, performs various checks, set defaults
+#' where arguments are missing, and return a correct list of data input. If no
+#' input is given, it returns the default settings.
+#'
+#' Acceptables arguments for ... are:
+#' \describe{
+#' \item{dates}{dates a vector indicating the collection dates, provided either as
+#' integer numbers or in a usual date format such as \code{Date} or
+#' \code{POSIXct} format. By convention, zero will indicate the oldest date. If
+#' the vector is named, the vector names will be used for matching cases to
+#' contact tracing data and labelled DNA sequences.}
+#'
+#' \item{dna}{the DNA sequences in \code{DNAbin} format (see
+#' \code{\link[ape]{read.dna}} in the ape package); this can be imported from a
+#' fasta file (extension .fa, .fas, or .fasta) using \code{adegenet}'s function
+#' \link[adegenet]{fasta2DNAbin}.}
+#'
+#' \item{ctd}{the contact tracing data provided as a matrix/dataframe of two
+#' columns, indicating a reported contact between the two individuals whose ids
+#' are provided in a given row of the data, or an epicontacts object. In the case
+#' of the latter, linelist IDs will be used for matching dates and DNA
+#' sequences.}
+#'
+#' \item{w_dens}{a vector of numeric values indicating the generation time
+#' distribution, reflecting the infectious potential of a case t = 1, 2, ...
+#' time steps after infection. By convention, it is assumed that
+#' newly infected patients cannot see new infections on the same time step. If not
+#' standardized, this distribution is rescaled to sum to 1.}
+#'
+#' \item{f_dens}{similar to \code{w_dens}, except that this is the distribution
+#' of the colonization time, i_e. time interval during which the pathogen can
+#' be sampled from the patient.}}
+#'
+#' @param ... a list of data items to be processed (see description).
+#'
+#' @param data optionally, an existing list of data item as returned by \code{outbreaker_data}.
+#'
+#' @author Thibaut Jombart (\email{thibautjombart@@gmail.com}).
+#'
+#' @export
+#'
+#' @examples
+#'
+#' x <- fake_outbreak
+#' outbreaker_data(dates = x$sample, dna = x$dna, w_dens = x$w)
+#'
+outbreaker_data <- function(..., data = list(...)) {
+
+  ## SET DEFAULTS ##
+  defaults <- list(dates = NULL,
+                   w_dens = NULL,
+                   f_dens = NULL,
+                   dna = NULL,
+                   ctd = NULL,
+                   N = 0L,
+                   L = 0L,
+                   D = NULL,
+                   max_range = NA,
+                   can_be_ances = NULL,
+                   log_w_dens = NULL,
+                   log_f_dens = NULL,
+                   contacts = NULL,
+                   C_combn = NULL,
+                   C_nrow = NULL,
+                   ids = NULL,
+                   has_dna = logical(0),
+                   id_in_dna = integer(0))
+
+  ## MODIFY DATA WITH ARGUMENTS ##
+  data <- modify_defaults(defaults, data, FALSE)
+
+  ## Set up case ids
+  if(is.null(data$ids)) {
+    if(!is.null(names(data$dates))) {
+      data$ids <- names(data$dates)
+    } else if(!is.null(data$ctd) & inherits(data$ctd, "epicontacts")){
+      data$ids <- as.character(data$ctd$linelist$id)
+    } else {
+      data$ids <- as.character(seq_along(data$dates))
+    }
+  }
+
+  ## CHECK DATA ##
+  ## CHECK DATES
+  if (!is.null(data$dates)) {
+    if (inherits(data$dates, "Date")) {
+      data$dates <- data$dates-min(data$dates)
+    }
+    if (inherits(data$dates, "POSIXct")) {
+      data$dates <- difftime(data$dates, min(data$dates), units="days")
+    }
+    if (inherits(data$dates, "numeric") && any(data$dates %% 1 != 0)) {
+      warning("Rounding non-integer dates to nearest integer")
+    }
+    data$dates <- as.integer(round(data$dates))
+    data$N <- length(data$dates)
+    data$max_range <- diff(range(data$dates))
+  }
+
+  ## CHECK W_DENS
+  if (!is.null(data$w_dens)) {
+    if (any(data$w_dens<0)) {
+      stop("w_dens has negative entries (these should be probabilities!)")
+    }
+
+    if (any(!is.finite(data$w_dens))) {
+      stop("non-finite values detected in w_dens")
+    }
+
+
+    ## Remove trailing zeroes to prevent starting with -Inf temporal loglike
+    if(data$w_dens[length(data$w_dens)] < 1e-15) {
+      final_index <- max(which(data$w_dens > 1e-15))
+      data$w_dens <- data$w_dens[1:final_index]
+    }
+
+    ## add an exponential tail summing to 1e-4 to 'w'
+    ## to cover the span of the outbreak
+    ## (avoids starting with -Inf temporal loglike)
+    if (length(data$w_dens) < data$max_range) {
+      length_to_add <- (data$max_range-length(data$w_dens)) + 10 # +10 to be on the safe side
+      val_to_add <- stats::dexp(seq_len(length_to_add), 1)
+      val_to_add <- 1e-4*(val_to_add/sum(val_to_add))
+      data$w_dens <- c(data$w_dens, val_to_add)
+    }
+
+    ## standardize the mass function
+    data$w_dens <- data$w_dens / sum(data$w_dens)
+    data$log_w_dens <- matrix(log(data$w_dens), nrow = 1)
+  }
+
+  ## CHECK F_DENS
+  if (!is.null(data$w_dens) && is.null(data$f_dens)) {
+    data$f_dens <- data$w_dens
+  }
+  if (!is.null(data$f_dens)) {
+    if (any(data$f_dens<0)) {
+      stop("f_dens has negative entries (these should be probabilities!)")
+    }
+
+    if (any(!is.finite(data$f_dens))) {
+      stop("non-finite values detected in f_dens")
+    }
+
+    data$f_dens <- data$f_dens / sum(data$f_dens)
+    data$log_f_dens <- log(data$f_dens)
+  }
+
+  ## CHECK POTENTIAL ANCESTRIES
+  if(!is.null(data$dates)) {
+    ## get temporal ordering constraint:
+    ## canBeAnces[i,j] is 'i' can be ancestor of 'j'
+    ## Calculate the serial interval from w_dens and f_dens
+    .get_SI <- function(w_dens, f_dens) {
+      wf <- stats::convolve(w_dens, rev(f_dens), type = 'open')
+      conv <- stats::convolve(rev(f_dens), rev(wf), type = 'open')
+      lf <- length(f_dens)
+      lw <- length(w_dens)
+      return(data.frame(x = (-lf + 2):(lw + lf - 1), d = conv))
+    }
+    ## Check if difference in sampling dates falls within serial interval
+    ## This allows for i to infect j even if it sampled after (SI < 0)
+    .can_be_ances <- function(date1, date2, SI) {
+      tdiff <- date2 - date1
+      out <- sapply(tdiff, function(i) return(i %in% SI$x))
+      return(out)
+    }
+    SI <- .get_SI(data$w_dens, data$f_dens)
+    data$can_be_ances <- outer(data$dates,
+                               data$dates,
+                               FUN=.can_be_ances,
+                               SI = SI) # strict < is needed as we impose w(0)=0
+    diag(data$can_be_ances) <- FALSE
+  }
+
+  ## CHECK DNA
+
+  if (!is.null(data$dna)) {
+    if (!inherits(data$dna, "DNAbin")) stop("dna is not a DNAbin object.")
+    if (!is.matrix(data$dna)) data$dna <- as.matrix(data$dna)
+
+    ## get matrix of distances
+
+    data$L <- ncol(data$dna) #  (genome length)
+    data$D <- as.matrix(ape::dist.dna(data$dna, model="N")) # distance matrix
+    storage.mode(data$D) <- "integer" # essential for C/C++ interface
+
+    ## get matching between sequences and cases
+
+    if (is.null(rownames(data$dna))) {
+      if (nrow(data$dna) != data$N) {
+        msg <- sprintf(paste("numbers of sequences and cases differ (%d vs %d):",
+                             "please label sequences"),
+                       nrow(data$dna), data$N)
+        stop(msg)
+      }
+
+      ## These need to be indices
+      rownames(data$D) <- colnames(data$D) <- seq_len(data$N)
+
+      ## These need to match dates/ctd ids
+      rownames(data$dna) <- data$ids
+    }
+    data$id_in_dna <- match(data$ids, rownames(data$dna))
+    if(any(is.na(match(rownames(data$dna), data$ids)))) {
+      stop("DNA sequence labels don't match case ids")
+    }
+
+  } else {
+    data$L <- 0L
+    data$D <- matrix(integer(0), ncol = 0, nrow = 0)
+    data$id_in_dna <- rep(NA_integer_, data$N)
+  }
+  data$has_dna <- !is.na(data$id_in_dna)
+
+
+  ## CHECK CTD
+
+  if (!is.null(data$ctd)) {
+    ctd <- data$ctd
+    if (inherits(ctd, c("matrix", "data.frame"))) {
+      ## prevent factor -> integer conversion
+      ctd <- apply(ctd, 2, as.character)
+      if (!is.matrix(ctd)) {
+        ctd <- as.matrix(ctd)
+      }
+      if(ncol(ctd) != 2) {
+        stop("ctd must contain two columns")
+      }
+    } else if(inherits(ctd, "epicontacts")) {
+      ## prevent factor -> integer conversion
+      ctd <- apply(ctd$contacts[c("from", "to")], 2, as.character)
+    } else {
+      stop("ctd is not a matrix, data.frame or epicontacts object")
+    }
+
+    unq <- unique(as.vector(ctd[,1:2]))
+    not_found <- unq[!unq %in% data$ids]
+    if (length(not_found) != 0) {
+      not_found <- sort(unique(not_found))
+      stop(paste("Individual(s)", paste(not_found, collapse = ", "),
+                 "in ctd are unknown cases (idx < 1 or > N")
+           )
+    }
+    contacts <- matrix(0, data$N, data$N)
+    mtch_1 <- match(ctd[,1], data$ids)
+    mtch_2 <- match(ctd[,2], data$ids)
+    contacts[cbind(mtch_2, mtch_1)] <- 1
+
+    data$contacts <- contacts
+    data$C_combn <- data$N*(data$N - 1)
+    data$C_nrow <- nrow(ctd)
+  } else {
+    data$contacts <- matrix(integer(0), ncol = 0, nrow = 0)
+  }
+
+  ## output is a list of checked data
+  return(data)
+
+}