Diff of /R/outbreaker_data.R [000000] .. [dfe06d]

Switch to unified view

a b/R/outbreaker_data.R
1
#' Process input data for outbreaker
2
#'
3
#' This function performs various checks on input data given to outbreaker.  It
4
#' takes a list of named items as input, performs various checks, set defaults
5
#' where arguments are missing, and return a correct list of data input. If no
6
#' input is given, it returns the default settings.
7
#'
8
#' Acceptables arguments for ... are:
9
#' \describe{
10
#' \item{dates}{dates a vector indicating the collection dates, provided either as
11
#' integer numbers or in a usual date format such as \code{Date} or
12
#' \code{POSIXct} format. By convention, zero will indicate the oldest date. If
13
#' the vector is named, the vector names will be used for matching cases to
14
#' contact tracing data and labelled DNA sequences.}
15
#'
16
#' \item{dna}{the DNA sequences in \code{DNAbin} format (see
17
#' \code{\link[ape]{read.dna}} in the ape package); this can be imported from a
18
#' fasta file (extension .fa, .fas, or .fasta) using \code{adegenet}'s function
19
#' \link[adegenet]{fasta2DNAbin}.}
20
#'
21
#' \item{ctd}{the contact tracing data provided as a matrix/dataframe of two
22
#' columns, indicating a reported contact between the two individuals whose ids
23
#' are provided in a given row of the data, or an epicontacts object. In the case
24
#' of the latter, linelist IDs will be used for matching dates and DNA
25
#' sequences.}
26
#'
27
#' \item{w_dens}{a vector of numeric values indicating the generation time
28
#' distribution, reflecting the infectious potential of a case t = 1, 2, ...
29
#' time steps after infection. By convention, it is assumed that
30
#' newly infected patients cannot see new infections on the same time step. If not
31
#' standardized, this distribution is rescaled to sum to 1.}
32
#'
33
#' \item{f_dens}{similar to \code{w_dens}, except that this is the distribution
34
#' of the colonization time, i_e. time interval during which the pathogen can
35
#' be sampled from the patient.}}
36
#'
37
#' @param ... a list of data items to be processed (see description).
38
#'
39
#' @param data optionally, an existing list of data item as returned by \code{outbreaker_data}.
40
#'
41
#' @author Thibaut Jombart (\email{thibautjombart@@gmail.com}).
42
#'
43
#' @export
44
#'
45
#' @examples
46
#'
47
#' x <- fake_outbreak
48
#' outbreaker_data(dates = x$sample, dna = x$dna, w_dens = x$w)
49
#'
50
outbreaker_data <- function(..., data = list(...)) {
51
52
  ## SET DEFAULTS ##
53
  defaults <- list(dates = NULL,
54
                   w_dens = NULL,
55
                   f_dens = NULL,
56
                   dna = NULL,
57
                   ctd = NULL,
58
                   N = 0L,
59
                   L = 0L,
60
                   D = NULL,
61
                   max_range = NA,
62
                   can_be_ances = NULL,
63
                   log_w_dens = NULL,
64
                   log_f_dens = NULL,
65
                   contacts = NULL,
66
                   C_combn = NULL,
67
                   C_nrow = NULL,
68
                   ids = NULL,
69
                   has_dna = logical(0),
70
                   id_in_dna = integer(0))
71
72
  ## MODIFY DATA WITH ARGUMENTS ##
73
  data <- modify_defaults(defaults, data, FALSE)
74
75
  ## Set up case ids
76
  if(is.null(data$ids)) {
77
    if(!is.null(names(data$dates))) {
78
      data$ids <- names(data$dates)
79
    } else if(!is.null(data$ctd) & inherits(data$ctd, "epicontacts")){
80
      data$ids <- as.character(data$ctd$linelist$id)
81
    } else {
82
      data$ids <- as.character(seq_along(data$dates))
83
    }
84
  }
85
86
  ## CHECK DATA ##
87
  ## CHECK DATES
88
  if (!is.null(data$dates)) {
89
    if (inherits(data$dates, "Date")) {
90
      data$dates <- data$dates-min(data$dates)
91
    }
92
    if (inherits(data$dates, "POSIXct")) {
93
      data$dates <- difftime(data$dates, min(data$dates), units="days")
94
    }
95
    if (inherits(data$dates, "numeric") && any(data$dates %% 1 != 0)) {
96
      warning("Rounding non-integer dates to nearest integer")
97
    }
98
    data$dates <- as.integer(round(data$dates))
99
    data$N <- length(data$dates)
100
    data$max_range <- diff(range(data$dates))
101
  }
102
103
  ## CHECK W_DENS
104
  if (!is.null(data$w_dens)) {
105
    if (any(data$w_dens<0)) {
106
      stop("w_dens has negative entries (these should be probabilities!)")
107
    }
108
109
    if (any(!is.finite(data$w_dens))) {
110
      stop("non-finite values detected in w_dens")
111
    }
112
113
114
    ## Remove trailing zeroes to prevent starting with -Inf temporal loglike
115
    if(data$w_dens[length(data$w_dens)] < 1e-15) {
116
      final_index <- max(which(data$w_dens > 1e-15))
117
      data$w_dens <- data$w_dens[1:final_index]
118
    }
119
120
    ## add an exponential tail summing to 1e-4 to 'w'
121
    ## to cover the span of the outbreak
122
    ## (avoids starting with -Inf temporal loglike)
123
    if (length(data$w_dens) < data$max_range) {
124
      length_to_add <- (data$max_range-length(data$w_dens)) + 10 # +10 to be on the safe side
125
      val_to_add <- stats::dexp(seq_len(length_to_add), 1)
126
      val_to_add <- 1e-4*(val_to_add/sum(val_to_add))
127
      data$w_dens <- c(data$w_dens, val_to_add)
128
    }
129
130
    ## standardize the mass function
131
    data$w_dens <- data$w_dens / sum(data$w_dens)
132
    data$log_w_dens <- matrix(log(data$w_dens), nrow = 1)
133
  }
134
135
  ## CHECK F_DENS
136
  if (!is.null(data$w_dens) && is.null(data$f_dens)) {
137
    data$f_dens <- data$w_dens
138
  }
139
  if (!is.null(data$f_dens)) {
140
    if (any(data$f_dens<0)) {
141
      stop("f_dens has negative entries (these should be probabilities!)")
142
    }
143
144
    if (any(!is.finite(data$f_dens))) {
145
      stop("non-finite values detected in f_dens")
146
    }
147
148
    data$f_dens <- data$f_dens / sum(data$f_dens)
149
    data$log_f_dens <- log(data$f_dens)
150
  }
151
152
  ## CHECK POTENTIAL ANCESTRIES
153
  if(!is.null(data$dates)) {
154
    ## get temporal ordering constraint:
155
    ## canBeAnces[i,j] is 'i' can be ancestor of 'j'
156
    ## Calculate the serial interval from w_dens and f_dens
157
    .get_SI <- function(w_dens, f_dens) {
158
      wf <- stats::convolve(w_dens, rev(f_dens), type = 'open')
159
      conv <- stats::convolve(rev(f_dens), rev(wf), type = 'open')
160
      lf <- length(f_dens)
161
      lw <- length(w_dens)
162
      return(data.frame(x = (-lf + 2):(lw + lf - 1), d = conv))
163
    }
164
    ## Check if difference in sampling dates falls within serial interval
165
    ## This allows for i to infect j even if it sampled after (SI < 0)
166
    .can_be_ances <- function(date1, date2, SI) {
167
      tdiff <- date2 - date1
168
      out <- sapply(tdiff, function(i) return(i %in% SI$x))
169
      return(out)
170
    }
171
    SI <- .get_SI(data$w_dens, data$f_dens)
172
    data$can_be_ances <- outer(data$dates,
173
                               data$dates,
174
                               FUN=.can_be_ances,
175
                               SI = SI) # strict < is needed as we impose w(0)=0
176
    diag(data$can_be_ances) <- FALSE
177
  }
178
179
  ## CHECK DNA
180
181
  if (!is.null(data$dna)) {
182
    if (!inherits(data$dna, "DNAbin")) stop("dna is not a DNAbin object.")
183
    if (!is.matrix(data$dna)) data$dna <- as.matrix(data$dna)
184
185
    ## get matrix of distances
186
187
    data$L <- ncol(data$dna) #  (genome length)
188
    data$D <- as.matrix(ape::dist.dna(data$dna, model="N")) # distance matrix
189
    storage.mode(data$D) <- "integer" # essential for C/C++ interface
190
191
    ## get matching between sequences and cases
192
193
    if (is.null(rownames(data$dna))) {
194
      if (nrow(data$dna) != data$N) {
195
        msg <- sprintf(paste("numbers of sequences and cases differ (%d vs %d):",
196
                             "please label sequences"),
197
                       nrow(data$dna), data$N)
198
        stop(msg)
199
      }
200
201
      ## These need to be indices
202
      rownames(data$D) <- colnames(data$D) <- seq_len(data$N)
203
204
      ## These need to match dates/ctd ids
205
      rownames(data$dna) <- data$ids
206
    }
207
    data$id_in_dna <- match(data$ids, rownames(data$dna))
208
    if(any(is.na(match(rownames(data$dna), data$ids)))) {
209
      stop("DNA sequence labels don't match case ids")
210
    }
211
212
  } else {
213
    data$L <- 0L
214
    data$D <- matrix(integer(0), ncol = 0, nrow = 0)
215
    data$id_in_dna <- rep(NA_integer_, data$N)
216
  }
217
  data$has_dna <- !is.na(data$id_in_dna)
218
219
220
  ## CHECK CTD
221
222
  if (!is.null(data$ctd)) {
223
    ctd <- data$ctd
224
    if (inherits(ctd, c("matrix", "data.frame"))) {
225
      ## prevent factor -> integer conversion
226
      ctd <- apply(ctd, 2, as.character)
227
      if (!is.matrix(ctd)) {
228
        ctd <- as.matrix(ctd)
229
      }
230
      if(ncol(ctd) != 2) {
231
        stop("ctd must contain two columns")
232
      }
233
    } else if(inherits(ctd, "epicontacts")) {
234
      ## prevent factor -> integer conversion
235
      ctd <- apply(ctd$contacts[c("from", "to")], 2, as.character)
236
    } else {
237
      stop("ctd is not a matrix, data.frame or epicontacts object")
238
    }
239
240
    unq <- unique(as.vector(ctd[,1:2]))
241
    not_found <- unq[!unq %in% data$ids]
242
    if (length(not_found) != 0) {
243
      not_found <- sort(unique(not_found))
244
      stop(paste("Individual(s)", paste(not_found, collapse = ", "),
245
                 "in ctd are unknown cases (idx < 1 or > N")
246
           )
247
    }
248
    contacts <- matrix(0, data$N, data$N)
249
    mtch_1 <- match(ctd[,1], data$ids)
250
    mtch_2 <- match(ctd[,2], data$ids)
251
    contacts[cbind(mtch_2, mtch_1)] <- 1
252
253
    data$contacts <- contacts
254
    data$C_combn <- data$N*(data$N - 1)
255
    data$C_nrow <- nrow(ctd)
256
  } else {
257
    data$contacts <- matrix(integer(0), ncol = 0, nrow = 0)
258
  }
259
260
  ## output is a list of checked data
261
  return(data)
262
263
}