--- a +++ b/partyMod/R/MOB-Utils.R @@ -0,0 +1,416 @@ +########################### +## convenience functions ## +########################### + +## obtain the number/ID for all terminal nodes +terminal_nodeIDs <- function(node) { + if(node$terminal) return(node$nodeID) + ll <- terminal_nodeIDs(node$left) + rr <- terminal_nodeIDs(node$right) + return(c(ll, rr)) +} + + +######################### +## workhorse functions ## +######################### + +### determine which observations go left or right +mob_fit_childweights <- function(node, mf, weights) { + + partvar <- mf@get("part") + xselect <- partvar[[node$psplit$variableID]] + + ## we need to coerce ordered factors to numeric + ## this is what party C code does as well! + + if (class(node$psplit) == "orderedSplit") { + leftweights <- (as.double(xselect) <= node$psplit$splitpoint) * weights + rightweights <- (as.double(xselect) > node$psplit$splitpoint) * weights + } else { + leftweights <- (xselect %in% + levels(xselect)[as.logical(node$psplit$splitpoint)]) * weights + rightweights <- (!(xselect %in% + levels(xselect)[as.logical(node$psplit$splitpoint)])) * weights + } + + list(left = leftweights, right = rightweights) +} + +### setup a new (inner or terminal) node of a tree +mob_fit_setupnode <- function(obj, mf, weights, control) { + + ### control parameters + alpha <- control$alpha + bonferroni <- control$bonferroni + minsplit <- control$minsplit + trim <- control$trim + objfun <- control$objfun + verbose <- control$verbose + breakties <- control$breakties + parm <- control$parm + + ### if too few observations: no split = return terminal node + if (sum(weights) < 2 * minsplit) { + node <- list(nodeID = NULL, weights = weights, + criterion = list(statistic = 0, criterion = 0, maxcriterion = 0), + terminal = TRUE, psplit = NULL, ssplits = NULL, + prediction = 0, left = NULL, right = NULL, + sumweights = as.double(sum(weights))) + class(node) <- "TerminalModelNode" + return(node) + } + + ### variable selection via fluctuation tests + test <- try(mob_fit_fluctests(obj, mf, minsplit = minsplit, trim = trim, + breakties = breakties, parm = parm)) + + if (!inherits(test, "try-error")) { + if(bonferroni) { + pval1 <- pmin(1, sum(!is.na(test$pval)) * test$pval) + pval2 <- 1 - (1-test$pval)^sum(!is.na(test$pval)) + test$pval <- ifelse(!is.na(test$pval) & (test$pval > 0.01), pval2, pval1) + } + + best <- test$best + TERMINAL <- is.na(best) || test$pval[best] > alpha + + if (verbose) { + cat("\n-------------------------------------------\nFluctuation tests of splitting variables:\n") + print(rbind(statistic = test$stat, p.value = test$pval)) + cat("\nBest splitting variable: ") + cat(names(test$stat)[best]) + cat("\nPerform split? ") + cat(ifelse(TERMINAL, "no", "yes")) + cat("\n-------------------------------------------\n") + } + } else { + TERMINAL <- TRUE + test <- list(stat = NA, pval = NA) + } + + ### splitting + na_max <- function(x) { + if(all(is.na(x))) NA else max(x, na.rm = TRUE) + } + if (TERMINAL) { + node <- list(nodeID = NULL, weights = weights, + criterion = list(statistic = test$stat, + criterion = 1 - test$pval, + maxcriterion = na_max(1 - test$pval)), + terminal = TRUE, psplit = NULL, ssplits = NULL, + prediction = 0, left = NULL, right = NULL, + sumweights = as.double(sum(weights))) + class(node) <- "TerminalModelNode" + return(node) + } else { + partvar <- mf@get("part") + xselect <- partvar[[best]] + thissplit <- mob_fit_splitnode(xselect, obj, mf, weights, minsplit = minsplit, + objfun = objfun, verbose = verbose) + + ## check if splitting was unsuccessful + if (identical(FALSE, thissplit)) { + node <- list(nodeID = NULL, weights = weights, + criterion = list(statistic = test$stat, + criterion = 1 - test$pval, + maxcriterion = na_max(1 - test$pval)), + terminal = TRUE, psplit = NULL, ssplits = NULL, + prediction = 0, left = NULL, right = NULL, + sumweights = as.double(sum(weights))) + class(node) <- "TerminalModelNode" + + ### more confusion than information + ### warning("no admissable split found", call. = FALSE) + if(verbose) + cat(paste("\nNo admissable split found in ", sQuote(names(test$stat)[best]), "\n", sep = "")) + return(node) + } + + thissplit$variableID <- best + thissplit$variableName <- names(partvar)[best] + node <- list(nodeID = NULL, weights = weights, + criterion = list(statistic = test$stat, + criterion = 1 - test$pval, + maxcriterion = na_max(1 - test$pval)), + terminal = FALSE, + psplit = thissplit, ssplits = NULL, + prediction = 0, left = NULL, right = NULL, + sumweights = as.double(sum(weights))) + class(node) <- "SplittingNode" + } + + node$variableID <- best + if (verbose) { + cat("\nNode properties:\n") + print(node$psplit, left = TRUE) + cat(paste("; criterion = ", round(node$criterion$maxcriterion, 3), + ", statistic = ", round(max(node$criterion$statistic), 3), "\n", + collapse = "", sep = "")) + } + node +} + +### variable selection: +### conduct all M-fluctuation tests of fitted obj +### with respect to each variable from a set of +### potential partitioning variables in mf +mob_fit_fluctests <- function(obj, mf, minsplit, trim, breakties, parm) { + ## Cramer-von Mises statistic might be supported in future versions + CvM <- FALSE + + ## set up return values + partvar <- mf@get("part") + m <- NCOL(partvar) + pval <- rep.int(0, m) + stat <- rep.int(0, m) + ifac <- rep.int(FALSE, m) + + ## extract estimating functions + process <- as.matrix(estfun(obj)) + k <- NCOL(process) + + ## extract weights + ww <- weights(obj) + if(is.null(ww)) ww <- rep(1, NROW(process)) + n <- sum(ww) + + ## drop observations with zero weight + ww0 <- (ww > 0) + process <- process[ww0, , drop = FALSE] + partvar <- partvar[ww0, , drop = FALSE] + ww <- ww[ww0] + ## repeat observations with weight > 1 + process <- process/ww + ww1 <- rep.int(1:length(ww), ww) + process <- process[ww1, , drop = FALSE] + stopifnot(NROW(process) == n) + + ## scale process + process <- process/sqrt(n) + J12 <- root.matrix(crossprod(process)) + process <- t(chol2inv(chol(J12)) %*% t(process)) + + ## select parameters to test + if(!is.null(parm)) process <- process[, parm, drop = FALSE] + k <- NCOL(process) + + ## get critical values for CvM statistic + if(CvM) { + if(k > 25) k <- 25 #Z# also issue warning + critval <- get("sc.meanL2")[as.character(k), ] + } else { + from <- if(trim > 1) trim else ceiling(n * trim) + from <- max(from, minsplit) + to <- n - from + lambda <- ((n-from)*to)/(from*(n-to)) + + beta <- get("sc.beta.sup") + logp.supLM <- function(x, k, lambda) + { + if(k > 40) { + ## use Estrella (2003) asymptotic approximation + logp_estrella2003 <- function(x, k, lambda) + -lgamma(k/2) + k/2 * log(x/2) - x/2 + log(abs(log(lambda) * (1 - k/x) + 2/x)) + ## FIXME: Estrella only works well for large enough x + ## hence require x > 1.5 * k for Estrella approximation and + ## use an ad hoc interpolation for larger p-values + p <- ifelse(x <= 1.5 * k, (x/(1.5 * k))^sqrt(k) * logp_estrella2003(1.5 * k, k, lambda), logp_estrella2003(x, k, lambda)) + } else { + ## use Hansen (1997) approximation + m <- ncol(beta)-1 + if(lambda<1) tau <- lambda + else tau <- 1/(1+sqrt(lambda)) + beta <- beta[(((k-1)*25 +1):(k*25)),] + dummy <- beta[,(1:m)]%*%x^(0:(m-1)) + dummy <- dummy*(dummy>0) + pp <- pchisq(dummy, beta[,(m+1)], lower.tail = FALSE, log.p = TRUE) + if(tau==0.5) + p <- pchisq(x, k, lower.tail = FALSE, log.p = TRUE) + else if(tau <= 0.01) + p <- pp[25] + else if(tau >= 0.49) + p <- log((exp(log(0.5-tau) + pp[1]) + exp(log(tau-0.49) + pchisq(x,k,lower.tail = FALSE, log.p = TRUE)))*100) + else + { + taua <- (0.51-tau)*50 + tau1 <- floor(taua) + p <- log(exp(log(tau1 + 1 - taua) + pp[tau1]) + exp(log(taua-tau1) + pp[tau1+1])) + } + } + return(as.vector(p)) + } + } + + ## compute statistic and p-value for each ordering + for(i in 1:m) { + pvi <- partvar[,i] + pvi <- pvi[ww1] + if(is.factor(pvi)) { + proci <- process[ORDER(pvi), , drop = FALSE] + ifac[i] <- TRUE + + # re-apply factor() added to drop unused levels + pvi <- factor(pvi[ORDER(pvi)]) + # compute segment weights + segweights <- as.vector(table(pvi))/n ## tapply(ww, pvi, sum)/n + + # compute statistic only if at least two levels are left + if(length(segweights) < 2) { + stat[i] <- 0 + pval[i] <- NA + } else { + stat[i] <- sum(sapply(1:k, function(j) (tapply(proci[,j], pvi, sum)^2)/segweights)) + pval[i] <- pchisq(stat[i], k*(length(levels(pvi))-1), log.p = TRUE, lower.tail = FALSE) + } + } else { + oi <- if(breakties) { + mm <- sort(unique(pvi)) + mm <- ifelse(length(mm) > 1, min(diff(mm))/10, 1) + ORDER(pvi + runif(length(pvi), min = -mm, max = +mm)) + } else { + ORDER(pvi) + } + proci <- process[oi, , drop = FALSE] + proci <- apply(proci, 2, cumsum) + stat[i] <- if(CvM) sum((proci)^2)/n + else if(from < to) { + xx <- rowSums(proci^2) + xx <- xx[from:to] + tt <- (from:to)/n + max(xx/(tt * (1-tt))) + } else { + 0 + } + pval[i] <- if(CvM) log(approx(c(0, critval), c(1, 1-as.numeric(names(critval))), stat[i], rule=2)$y) + else if(from < to) logp.supLM(stat[i], k, lambda) else NA + } + } + + ## select variable with minimal p-value + best <- which.min(pval) + if(length(best) < 1) best <- NA + rval <- list(pval = exp(pval), stat = stat, best = best) + names(rval$pval) <- names(partvar) + names(rval$stat) <- names(partvar) + if (!all(is.na(rval$best))) + names(rval$best) <- names(partvar)[rval$best] + return(rval) +} + +### split in variable x, either ordered or nominal +mob_fit_splitnode <- function(x, obj, mf, weights, minsplit, objfun, verbose = TRUE) { + + ## process minsplit (to minimal number of observations) + if (minsplit > 0.5 & minsplit < 1) minsplit <- 1 - minsplit + if (minsplit < 0.5) + minsplit <- ceiling(sum(weights) * minsplit) + + if (is.numeric(x)) { + ### for numerical variables + ux <- sort(unique(x)) + if (length(ux) == 0) stop("cannot find admissible split point in x") + dev <- vector(mode = "numeric", length = length(ux)) + + for (i in 1:length(ux)) { + xs <- x <= ux[i] + if (mob_fit_checksplit(xs, weights, minsplit)) { + dev[i] <- Inf + } else { + dev[i] <- mob_fit_getobjfun(obj, mf, weights, xs, objfun = objfun) + } + } + + ## maybe none of the possible splits is admissible + if (all(!is.finite(dev))) return(FALSE) + + split <- list(variableID = NULL, ordered = TRUE, + splitpoint = as.double(ux[which.min(dev)]), + splitstatistic = dev, toleft = TRUE) + class(split) <- "orderedSplit" + } else { + ### for categorical variables + al <- mob_fit_getlevels(x) + dev <- apply(al, 1, function(w) { + xs <- x %in% levels(x)[w] + if (mob_fit_checksplit(xs, weights, minsplit)) { + return(Inf) + } else { + mob_fit_getobjfun(obj, mf, weights, xs, objfun = objfun) + } + }) + + if (verbose) { + cat(paste("\nSplitting ", if(is.ordered(x)) "ordered ", + "factor variable, objective function: \n", sep = "")) + print(dev) + } + + if (all(!is.finite(dev))) return(FALSE) + + ## ordered factors are of storage mode "numeric" in party! + ## initVariableFrame coerces ordered factors to storage.mode "numeric" + ## the following is consistent with party + + if (is.ordered(x)) { + split <- list(variableID = NULL, ordered = TRUE, + splitpoint = as.double(which.min(dev)), + splitstatistic = dev, toleft = TRUE) + class(split) <- "orderedSplit" + attr(split$splitpoint, "levels") <- levels(x) + } else { + tab <- as.integer(table(x[weights > 0]) > 0) + split <- list(variableID = NULL, ordered = FALSE, + splitpoint = as.integer(al[which.min(dev),]), + splitstatistic = dev, + toleft = TRUE, table = tab) + attr(split$splitpoint, "levels") <- levels(x) + class(split) <- "nominalSplit" + } + } + split +} + +### get partitioned objective function for a particular split +mob_fit_getobjfun <- function(obj, mf, weights, left, objfun = deviance) { + ## mf is the model frame + ## weights are the observation weights + ## left is 1 (if left of splitpoint) or 0 + weightsleft <- weights * left + weightsright <- weights * (1 - left) + + ### fit left / right model + fmleft <- reweight(obj, weights = weightsleft) + fmright <- reweight(obj, weights = weightsright) + + return(objfun(fmleft) + objfun(fmright)) +} + +### determine all possible splits for a factor, both nominal and ordinal +mob_fit_getlevels <- function(x) { + nl <- nlevels(x) + if (inherits(x, "ordered")) { + indx <- diag(nl) + indx[lower.tri(indx)] <- 1 + indx <- indx[-nl,] + rownames(indx) <- levels(x)[-nl] + } else { + mi <- 2^(nl - 1) - 1 + indx <- matrix(0, nrow = mi, ncol = nl) + for (i in 1:mi) { # go though all splits # + ii <- i + for (l in 1:nl) { + indx[i, l] <- ii%%2; + ii <- ii %/% 2 + } + } + rownames(indx) <- apply(indx, 1, function(z) paste(levels(x)[z > 0], collapse = "+")) + } + colnames(indx) <- as.character(levels(x)) + storage.mode(indx) <- "logical" + indx +} + +### check split +mob_fit_checksplit <- function(split, weights, minsplit) + (sum(split * weights) < minsplit || sum((1 - split) * weights) < minsplit)