--- a +++ b/src/likelihoods.cpp @@ -0,0 +1,619 @@ +#include <Rmath.h> +#include <Rcpp.h> +#include "internals.h" +#include "likelihoods.h" + + +// IMPORTANT: ON INDEXING VECTORS AND ANCESTRIES + +// Most of the functions implemented here are susceptible to be called from R +// via Rcpp, and are therefore treated as interfaces. This causes a number of +// headaches when using indices of cases defined in R (1:N) to refer to elements +// in Rcpp / Cpp vectors (0:N-1). By convention, we store all data on the +// original scale (1:N), and modify indices whenever accessing elements of +// vectors. In other words, in an expression like 'alpha[j]', 'j' should always +// be on the internal scale (0:N-1). + +// In all these functions, 'SEXP i' is an optional vector of case indices, on +// the 1:N scale. + + + + + + +// --------------------------- + +// This likelihood corresponds to the probability of observing a number of +// mutations between cases and their ancestors. See src/likelihoods.cpp for +// details of the Rcpp implmentation. + +// The likelihood is based on the number of mutations between a case and its +// ancestor; these are extracted from a pairwise genetic distance matrix +// (data$D) the log-likelihood is computed as: sum(mu^nmut + (1-mu)^(L-nmut)) +// with: + +// 'mu' is the mutation probability +// 'L' the number of sites in the alignment +// 'n_mut' the number of mutations between an ancestor and its descendent +// 'n_non_mut' the number of sites that have not mutated + +// For any given case at 'nmut' mutations from its ancestor, with kappa +// generations in between, the log-likelihood is defined as: + +// log(mu) * n_mut + log(1 - mu) * {(L - n_mut) + (L * (kappa-1))} + + +// when summing over several individuals, it becomes: + +// log(mu) * sum_i(n_mut_i) + log(1-mu) * sum_i((L - n_mut_i) + (L * (kappa_i - 1))) + +double cpp_ll_genetic(Rcpp::List data, Rcpp::List param, SEXP i, + Rcpp::RObject custom_function) { + Rcpp::IntegerMatrix D = data["D"]; + if (D.ncol() < 1) return 0.0; + + size_t N = static_cast<size_t>(data["N"]); + if (N < 2) return 0.0; + + Rcpp::List l; + if (custom_function != R_NilValue) { + l = Rcpp::as<Rcpp::List>(custom_function); + } + if (custom_function == R_NilValue || (l.size() > 0 && l[0] == R_NilValue)) { + + // Variables from the data & param + Rcpp::NumericMatrix w_dens = data["log_w_dens"]; + size_t K = w_dens.nrow(); + double mu = Rcpp::as<double>(param["mu"]); + long int L = Rcpp::as<int>(data["L"]); + Rcpp::IntegerVector alpha = param["alpha"]; // values are on 1:N + Rcpp::IntegerVector kappa = param["kappa"]; + Rcpp::LogicalVector has_dna = data["has_dna"]; + + + // Local variables used for computatoins + size_t n_mut = 0; + size_t n_non_mut = 0; + double out = 0; + bool found[1]; + size_t ances[1]; + size_t n_generations[1]; + found[0] = false; + ances[0] = NA_INTEGER; + n_generations[0] = NA_INTEGER; + + + // Invalid values of mu + if (mu < 0.0 || mu > 1.0) { + return R_NegInf; + } + + + // NOTE ON MISSING SEQUENCES + + // Terms participating to the genetic likelihood correspond to pairs + // of ancestor-descendent which have a genetic sequence. The + // log-likelihood of other pairs is 0.0, and can therefore be + // ommitted. Note the possible source of confusion in indices here: + + // 'has_dna' is a vector, thus indexed from 0:(N-1) + + // 'cpp_get_n_mutations' is a function, and thus takes indices on 1:N + + + + // all cases are retained + + if (i == R_NilValue) { + for (size_t j = 0; j < N; j++) { // 'j' on 0:(N-1) + if (alpha[j] != NA_INTEGER) { + + // kappa restriction + + if (kappa[j] < 1 || kappa[j] > K) { + return R_NegInf; + } + + // missing sequences handled here + + if (has_dna[j]) { + + lookup_sequenced_ancestor(alpha, kappa, has_dna, j + 1, + ances, n_generations, found); + + if (found[0]) { + + n_mut = cpp_get_n_mutations(data, j + 1, ances[0]); // remember the offset + n_non_mut = L - n_mut; + + out += n_mut*log(n_generations[0]*mu) + + n_non_mut*log(1 - n_generations[0]*mu); + + } + } + } + } + + } else { + // only the cases listed in 'i' are retained + size_t length_i = static_cast<size_t>(LENGTH(i)); + Rcpp::IntegerVector vec_i(i); + for (size_t k = 0; k < length_i; k++) { + size_t j = vec_i[k] - 1; // offset + if (alpha[j] != NA_INTEGER) { + // kappa restriction + if (kappa[j] < 1 || kappa[j] > K) { + return R_NegInf; + } + + // missing sequences handled here + + if (has_dna[j]) { + + lookup_sequenced_ancestor(alpha, kappa, has_dna, j + 1, + ances, n_generations, found); + + if (found[0]) { + + n_mut = cpp_get_n_mutations(data, j + 1, ances[0]); // remember the offset + n_non_mut = L - n_mut; + + out += n_mut*log(n_generations[0]*mu) + + n_non_mut*log(1 - n_generations[0]*mu); + + } + } + + } + + } + } + + return(out); + + } else { // use of a customized likelihood function + Rcpp::Function f = Rcpp::as<Rcpp::Function>(l[0]); + int arity = l[1]; + if (arity == 3) return Rcpp::as<double>(f(data, param, i)); + return Rcpp::as<double>(f(data, param)); + } +} + + +double cpp_ll_genetic(Rcpp::List data, Rcpp::List param, size_t i, + Rcpp::RObject custom_function) { + SEXP si = PROTECT(Rcpp::wrap(i)); + double ret = cpp_ll_genetic(data, param, si, custom_function); + UNPROTECT(1); + return ret; +} + + + + + +// --------------------------- + +// This likelihood corresponds to the probability of observing infection dates +// of cases given the infection dates of their ancestors. + +double cpp_ll_timing_infections(Rcpp::List data, Rcpp::List param, SEXP i, + Rcpp::RObject custom_function) { + size_t N = static_cast<size_t>(data["N"]); + if(N < 2) return 0.0; + + Rcpp::List l; + if (custom_function != R_NilValue) { + l = Rcpp::as<Rcpp::List>(custom_function); + } + if (custom_function == R_NilValue || (l.size() > 0 && l[0] == R_NilValue)) { + + Rcpp::IntegerVector alpha = param["alpha"]; + Rcpp::IntegerVector t_inf = param["t_inf"]; + Rcpp::IntegerVector kappa = param["kappa"]; + Rcpp::NumericMatrix w_dens = data["log_w_dens"]; + size_t K = w_dens.nrow(); + + double out = 0.0; + + // all cases are retained + if (i == R_NilValue) { + for (size_t j = 0; j < N; j++) { + if (alpha[j] != NA_INTEGER) { + size_t delay = t_inf[j] - t_inf[alpha[j] - 1]; // offset + if (delay < 1 || delay > w_dens.ncol()) { + return R_NegInf; + } + if (kappa[j] < 1 || kappa[j] > K) { + return R_NegInf; + } + + out += w_dens(kappa[j] - 1, delay - 1); + } + } + } else { + // only the cases listed in 'i' are retained + size_t length_i = static_cast<size_t>(LENGTH(i)); + Rcpp::IntegerVector vec_i(i); + for (size_t k = 0; k < length_i; k++) { + size_t j = vec_i[k] - 1; // offset + if (alpha[j] != NA_INTEGER) { + size_t delay = t_inf[j] - t_inf[alpha[j] - 1]; // offset + if (delay < 1 || delay > w_dens.ncol()) { + return R_NegInf; + } + if (kappa[j] < 1 || kappa[j] > K) { + return R_NegInf; + } + + out += w_dens(kappa[j] - 1, delay - 1); + } + + } + } + + return out; + } else { // use of a customized likelihood function + Rcpp::Function f = Rcpp::as<Rcpp::Function>(l[0]); + int arity = l[1]; + if (arity == 3) return Rcpp::as<double>(f(data, param, i)); + return Rcpp::as<double>(f(data, param)); + } +} + + +double cpp_ll_timing_infections(Rcpp::List data, Rcpp::List param, size_t i, + Rcpp::RObject custom_function) { + SEXP si = PROTECT(Rcpp::wrap(i)); + double ret = cpp_ll_timing_infections(data, param, si, custom_function); + UNPROTECT(1); + return ret; +} + + + + + +// --------------------------- + +// This likelihood corresponds to the probability of reporting dates of cases +// given their infection dates. + +double cpp_ll_timing_sampling(Rcpp::List data, Rcpp::List param, SEXP i, + Rcpp::RObject custom_function) { + size_t N = static_cast<size_t>(data["N"]); + if(N < 2) return 0.0; + + Rcpp::List l; + if (custom_function != R_NilValue) { + l = Rcpp::as<Rcpp::List>(custom_function); + } + if (custom_function == R_NilValue || (l.size() > 0 && l[0] == R_NilValue)) { + + Rcpp::IntegerVector dates = data["dates"]; + Rcpp::IntegerVector t_inf = param["t_inf"]; + Rcpp::NumericVector f_dens = data["log_f_dens"]; + + double out = 0.0; + + // all cases are retained + if (i == R_NilValue) { + for (size_t j = 0; j < N; j++) { + size_t delay = dates[j] - t_inf[j]; + if (delay < 1 || delay > f_dens.size()) { + return R_NegInf; + } + out += f_dens[delay - 1]; + } + } else { + // only the cases listed in 'i' are retained + size_t length_i = static_cast<size_t>(LENGTH(i)); + Rcpp::IntegerVector vec_i(i); + for (size_t k = 0; k < length_i; k++) { + size_t j = vec_i[k] - 1; // offset + size_t delay = dates[j] - t_inf[j]; + if (delay < 1 || delay > f_dens.size()) { + return R_NegInf; + } + out += f_dens[delay - 1]; + } + } + + return out; + } else { // use of a customized likelihood function + Rcpp::Function f = Rcpp::as<Rcpp::Function>(l[0]); + int arity = l[1]; + if (arity == 3) return Rcpp::as<double>(f(data, param, i)); + return Rcpp::as<double>(f(data, param)); + } +} + + +double cpp_ll_timing_sampling(Rcpp::List data, Rcpp::List param, size_t i, + Rcpp::RObject custom_function) { + SEXP si = PROTECT(Rcpp::wrap(i)); + double ret = cpp_ll_timing_sampling(data, param, si, custom_function); + UNPROTECT(1); + return ret; +} + + + + + +// --------------------------- + +// This likelihood corresponds to the probability of a given number of +// unreported cases on an ancestry. + +// The likelihood is given by a geometric distribution with probability 'pi' +// to report a case + +// - 'kappa' is the number of generation between two successive cases +// - 'kappa-1' is the number of unreported cases + +double cpp_ll_reporting(Rcpp::List data, Rcpp::List param, SEXP i, + Rcpp::RObject custom_function) { + Rcpp::NumericMatrix w_dens = data["log_w_dens"]; + size_t K = w_dens.nrow(); + + size_t N = static_cast<size_t>(data["N"]); + if(N < 2) return 0.0; + + double pi = static_cast<double>(param["pi"]); + Rcpp::IntegerVector kappa = param["kappa"]; + + // p(pi < 0) = p(pi > 1) = 0 + if (pi < 0.0 || pi > 1.0) { + return R_NegInf; + } + + Rcpp::List l; + if (custom_function != R_NilValue) { + l = Rcpp::as<Rcpp::List>(custom_function); + } + if (custom_function == R_NilValue || (l.size() > 0 && l[0] == R_NilValue)) { + + double out = 0.0; + + // all cases are retained + if (i == R_NilValue) { + for (size_t j = 0; j < N; j++) { + if (kappa[j] != NA_INTEGER) { + if (kappa[j] < 1 || kappa[j] > K) { + return R_NegInf; + } + out += R::dgeom(kappa[j] - 1.0, pi, 1); // first arg must be cast to double + } + } + } else { + // only the cases listed in 'i' are retained + size_t length_i = static_cast<size_t>(LENGTH(i)); + Rcpp::IntegerVector vec_i(i); + for (size_t k = 0; k < length_i; k++) { + size_t j = vec_i[k] - 1; // offset + if (kappa[j] != NA_INTEGER) { + if (kappa[j] < 1 || kappa[j] > K) { + return R_NegInf; + } + out += R::dgeom(kappa[j] - 1.0, pi, 1); // first arg must be cast to double + } + } + } + + return out; + } else { // use of a customized likelihood function + Rcpp::Function f = Rcpp::as<Rcpp::Function>(l[0]); + int arity = l[1]; + if (arity == 3) return Rcpp::as<double>(f(data, param, i)); + return Rcpp::as<double>(f(data, param)); + } +} + + +double cpp_ll_reporting(Rcpp::List data, Rcpp::List param, size_t i, + Rcpp::RObject custom_function) { + SEXP si = PROTECT(Rcpp::wrap(i)); + double ret = cpp_ll_reporting(data, param, si, custom_function); + UNPROTECT(1); + return ret; +} + + + + + +// --------------------------- + +// This likelihood corresponds to the probability of observing a a reported +// contact between cases and their ancestors. See +// src/likelihoods.cpp for details of the Rcpp implmentation. + +// The likelihood is based on the contact status between a case and its +// ancestor; this is extracted from a pairwise contact matrix (data$C), the +// log-likelihood is computed as: +// true_pos*eps + false_pos*eps*xi + +// false_neg*(1- eps) + true_neg*(1 - eps*xi) +// +// with: +// 'eps' is the contact reporting coverage +// 'lambda' is the non-infectious contact rate +// 'true_pos' is the number of contacts between transmission pairs +// 'false_pos' is the number of contact between non-transmission pairs +// 'false_neg' is the number of transmission pairs without contact +// 'true_neg' is the number of non-transmission pairs without contact + +double cpp_ll_contact(Rcpp::List data, Rcpp::List param, SEXP i, + Rcpp::RObject custom_function) { + Rcpp::NumericMatrix contacts = data["contacts"]; + if (contacts.ncol() < 1) return 0.0; + + size_t C_combn = static_cast<size_t>(data["C_combn"]); + size_t C_nrow = static_cast<size_t>(data["C_nrow"]); + + size_t N = static_cast<size_t>(data["N"]); + if (N < 2) return 0.0; + + Rcpp::List l; + if (custom_function != R_NilValue) { + l = Rcpp::as<Rcpp::List>(custom_function); + } + if (custom_function == R_NilValue || (l.size() > 0 && l[0] == R_NilValue)) { + + double out; + double eps = Rcpp::as<double>(param["eps"]); + double lambda = Rcpp::as<double>(param["lambda"]); + Rcpp::IntegerVector alpha = param["alpha"]; + Rcpp::IntegerVector kappa = param["kappa"]; + + size_t true_pos = 0; + size_t false_pos = 0; + size_t false_neg = 0; + size_t true_neg = 0; + size_t imports = 0; + size_t unobsv_case = 0; + + // p(eps < 0 || lambda < 0) = 0 + if (eps < 0.0 || lambda < 0.0) { + return R_NegInf; + } + + // all cases are retained (currently no support for i subsetting) + for (size_t j = 0; j < N; j++) { + if (alpha[j] == NA_INTEGER) { + imports += 1; + } else if (kappa[j] > 1) { + unobsv_case += 1; + } else { + true_pos += contacts(j, alpha[j] - 1); // offset + } + } + + false_pos = C_nrow - true_pos; + false_neg = N - imports - unobsv_case - true_pos; + true_neg = C_combn - true_pos - false_pos - false_neg; + + // deal with special case when lambda == 0 and eps == 1, to avoid log(0) + if(lambda == 0.0) { + if(false_pos > 0) { + out = R_NegInf; + } else { + out = log(eps) * (double) true_pos + + log(1 - eps) * (double) false_neg + + log(1 - eps*lambda) * (double) true_neg; + } + } else if(eps == 1.0) { + if(false_neg > 0) { + out = R_NegInf; + } else { + out = log(eps) * (double) true_pos + + log(eps*lambda) * (double) false_pos + + log(1 - eps*lambda) * (double) true_neg; + } + } else { + out = log(eps) * (double) true_pos + + log(eps*lambda) * (double) false_pos + + log(1 - eps) * (double) false_neg + + log(1 - eps*lambda) * (double) true_neg; + } + + return out; + + } else { //use of a customized likelihood function + Rcpp::Function f = Rcpp::as<Rcpp::Function>(l[0]); + int arity = l[1]; + if (arity == 3) return Rcpp::as<double>(f(data, param, i)); + return Rcpp::as<double>(f(data, param)); + } +} + + +double cpp_ll_contact(Rcpp::List data, Rcpp::List param, size_t i, + Rcpp::RObject custom_function) { + SEXP si = PROTECT(Rcpp::wrap(i)); + double ret = cpp_ll_contact(data, param, si, custom_function); + UNPROTECT(1); + return ret; +} + + + + + +// --------------------------- + +// This likelihood corresponds to the sums of the separate timing likelihoods, +// which include: + +// - p(infection dates): see function cpp_ll_timing_infections +// - p(collection dates): see function cpp_ll_timing_sampling + +double cpp_ll_timing(Rcpp::List data, Rcpp::List param, SEXP i, + Rcpp::RObject custom_functions) { + + if (custom_functions == R_NilValue) { + return cpp_ll_timing_infections(data, param, i) + + cpp_ll_timing_sampling(data, param, i); + } else { // use of a customized likelihood functions + Rcpp::List list_functions = Rcpp::as<Rcpp::List>(custom_functions); + return cpp_ll_timing_infections(data, param, i, list_functions["timing_infections"]) + + cpp_ll_timing_sampling(data, param, i, list_functions["timing_sampling"]); + + } +} + + +double cpp_ll_timing(Rcpp::List data, Rcpp::List param, size_t i, + Rcpp::RObject custom_function) { + SEXP si = PROTECT(Rcpp::wrap(i)); + double ret = cpp_ll_timing(data, param, si, custom_function); + UNPROTECT(1); + return ret; +} + + + + +// --------------------------- + +// This likelihood corresponds to the sums of the separate likelihoods, which +// include: + +// - p(infection dates): see function cpp_ll_timing_infections +// - p(collection dates): see function cpp_ll_timing_sampling +// - p(genetic diversity): see function cpp_ll_genetic +// - p(missing cases): see function cpp_ll_reporting +// - p(contact): see function cpp_ll_contact + +double cpp_ll_all(Rcpp::List data, Rcpp::List param, SEXP i, + Rcpp::RObject custom_functions) { + + if (custom_functions == R_NilValue) { + + return cpp_ll_timing_infections(data, param, i) + + cpp_ll_timing_sampling(data, param, i) + + cpp_ll_genetic(data, param, i) + + cpp_ll_reporting(data, param, i) + + cpp_ll_contact(data, param, i); + + } else { // use of a customized likelihood functions + Rcpp::List list_functions = Rcpp::as<Rcpp::List>(custom_functions); + + return cpp_ll_timing_infections(data, param, i, list_functions["timing_infections"]) + + cpp_ll_timing_sampling(data, param, i, list_functions["timing_sampling"]) + + cpp_ll_genetic(data, param, i, list_functions["genetic"]) + + cpp_ll_reporting(data, param, i, list_functions["reporting"]) + + cpp_ll_contact(data, param, i, list_functions["contact"]); + + } +} + + +double cpp_ll_all(Rcpp::List data, Rcpp::List param, size_t i, + Rcpp::RObject custom_function) { + SEXP si = PROTECT(Rcpp::wrap(i)); + double ret = cpp_ll_all(data, param, si, custom_function); + UNPROTECT(1); + return ret; +}