atlantis / Git / [fbf06f] /partyMod/R/varimp.R

Models:
DanielG/
atlantis
Downloads: 1
[fbf06f]: / partyMod / R / varimp.R
History
Download this file
379 lines (321 with data), 15.7 kB

# for the current variable of interest, xname,
# create the list of variables to condition on:

create_cond_list <- function(cond, threshold, xname, input) {

   stopifnot(is.logical(cond))
   if (!cond) return(NULL)
   if (threshold > 0 & threshold < 1) {
           ctrl <- ctree_control(teststat = "quad", testtype = "Univariate", stump = TRUE)
           xnames <- names(input)
           xnames <- xnames[xnames != xname]
           ct <- ctree(as.formula(paste(xname, "~", paste(xnames, collapse = "+"), collapse = "")),
                       data = input, controls = ctrl)
           crit <- ct@tree$criterion[[2]]
           crit[which(is.na(crit))] <- 0
           return(xnames[crit > threshold])
       }
   stop()
}



## mincriterion = 0 so that complete tree is evaluated; 
## regulate size of considered tree here via, e.g., mincriterion = 0.95
## or when building the forest in the first place via cforest_control(mincriterion = 0.95)

varimp <- function (object, mincriterion = 0, conditional = FALSE, 
                    threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional)
{

    response <- object@responses
    if (length(response@variables) == 1 && 
        inherits(response@variables[[1]], "Surv"))
        return(varimpsurv(object, mincriterion, conditional, threshold, nperm, OOB, pre1.0_0))
    input <- object@data@get("input")
    xnames <- colnames(input)
    inp <- initVariableFrame(input, trafo = NULL)
    y <- object@responses@variables[[1]]
    if(length(response@variables) != 1)
        stop("cannot compute variable importance measure for multivariate response")

    if (conditional || pre1.0_0) {
        if(!all(complete.cases(inp@variables)))
            stop("cannot compute variable importance measure with missing values")
    }
    CLASS <- all(response@is_nominal)
    ORDERED <- all(response@is_ordinal)
    if (CLASS) {
        error <- function(x, oob) mean((levels(y)[sapply(x, which.max)] != 
            y)[oob])
    }
    else {
        if (ORDERED) {
            error <- function(x, oob) mean((sapply(x, which.max) != 
                y)[oob])
        }
        else {
            error <- function(x, oob) mean((unlist(x) - y)[oob]^2)
        }
    }

    w <- object@initweights
    if (max(abs(w - 1)) > sqrt(.Machine$double.eps))
        warning(sQuote("varimp"), " with non-unity weights might give misleading results")

    ## list for several permutations
    perror <- matrix(0, nrow = nperm*length(object@ensemble), ncol = length(xnames))
    ## this matrix is initialized with values 0 so that a tree that does not 
    ## contain the current variable adds importance 0 to its average importance
    colnames(perror) <- xnames
        for (b in 1:length(object@ensemble)){
            tree <- object@expand(object@ensemble[[b]])

            ## if OOB == TRUE use only oob observations, otherwise use all observations in learning sample
            if(OOB){oob <- object@weights[[b]] == 0} else{ oob <- rep(TRUE, length(y))}
            p <- .Call("R_predict", tree, inp, mincriterion, -1L, PACKAGE = "atlantisPartyMod")
            eoob <- error(p, oob)

            ## for all variables (j = 1 ... number of variables) 
            for(j in unique(varIDs(tree))){
              for (per in 1:nperm){

                if (conditional || pre1.0_0) {
                    tmp <- inp
                    ccl <- create_cond_list(conditional, threshold, xnames[j], input)
                    if (is.null(ccl)) {
                        perm <- sample(which(oob))
                    } else {
                        perm <- conditional_perm(ccl, xnames, input, tree, oob)
                    }
                    tmp@variables[[j]][which(oob)] <- tmp@variables[[j]][perm]
                    p <- .Call("R_predict", tree, tmp, mincriterion, -1L,
                       PACKAGE = "atlantisPartyMod")
                } else {
                    p <- .Call("R_predict", tree, inp, mincriterion, as.integer(j),
                               PACKAGE = "atlantisPartyMod")
                }
                ## run through all rows of perror
                perror[(per+(b-1)*nperm), j] <- (error(p, oob) - eoob)

              } ## end of for (per in 1:nperm)
            } ## end of for(j in unique(varIDs(tree)))
        } ## end of for (b in 1:length(object@ensemble))

    perror <- as.data.frame(perror)
    #return(MeanDecreaseAccuracy = perror) ## return the whole matrix (= nperm*ntree values per variable)
    return(MeanDecreaseAccuracy = colMeans(perror)) ## return only averages over permutations and trees
}


varimpsurv <- function (object, mincriterion = 0, conditional = FALSE, 
                        threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional)
{

    cat("\n")
    cat("Variable importance for survival forests; this feature is _experimental_\n\n")
    response <- object@responses
    input <- object@data@get("input")
    xnames <- colnames(input)
    inp <- initVariableFrame(input, trafo = NULL)
    y <- object@responses@variables[[1]]
    weights <- object@initweights
    stopifnot(inherits(y, "Surv"))

    if (conditional || pre1.0_0) {
        if(!all(complete.cases(inp@variables)))
            stop("cannot compute variable importance measure with missing values")
    }
    stopifnot(require("ipred", quietly = TRUE))
    error <- function(x, oob) sbrier(y[oob,,drop = FALSE], x[oob])

    pred <- function(tree, newinp, j = -1L) {

        where <- R_get_nodeID(tree, inp, mincriterion)
        wh <- .Call("R_get_nodeID", tree, newinp, mincriterion, as.integer(j), PACKAGE = "atlantisPartyMod")
        swh <- sort(unique(wh))
        RET <- vector(mode = "list", length = length(wh))
        for (i in 1:length(swh)) {
            w <- weights * (where == swh[i])
            RET[wh == swh[i]] <- list(mysurvfit(y, weights = w))
        }
        return(RET)
    }

    w <- object@initweights
    if (max(abs(w - 1)) > sqrt(.Machine$double.eps))
        warning(sQuote("varimp"), " with non-unity weights might give misleading results")

    ## list for several permutations
    perror <- matrix(0, nrow = nperm*length(object@ensemble), ncol = length(xnames))
    ## this matrix is initialized with values 0 so that a tree that does not 
    ## contain the current variable adds importance 0 to its average importance
    colnames(perror) <- xnames
        for (b in 1:length(object@ensemble)){
            tree <- object@ensemble[[b]]


            ## if OOB == TRUE use only oob observations, otherwise use all observations in learning sample
            if(OOB){oob <- object@weights[[b]] == 0} else{ oob <- rep(TRUE, length(y))}
            p <- pred(tree, inp)
            eoob <- error(p, oob)

            ## for all variables (j = 1 ... number of variables) 
            for(j in unique(varIDs(tree))){
              for (per in 1:nperm){
                 if (conditional || pre1.0_0) {
                    tmp <- inp
                    ccl <- create_cond_list(conditional, threshold, xnames[j], input)
                    if (is.null(ccl)) {
                        perm <- sample(which(oob))
                    } else {
                        perm <- conditional_perm(ccl, xnames, input, tree, oob)
                    }
                    tmp@variables[[j]][which(oob)] <- tmp@variables[[j]][perm]
                    p <- pred(tree, tmp, -1L)
                } else {
                    p <- pred(tree, inp, as.integer(j))
                }

                ## run through all rows of perror
                perror[(per+(b-1)*nperm), j] <- (error(p, oob) - eoob)

              } ## end of for (per in 1:nperm)
            } ## end of for(j in unique(varIDs(tree)))
        } ## end of for (b in 1:length(object@ensemble))

    perror <- as.data.frame(perror)
    #return(MeanDecreaseAccuracy = perror) ## return the whole matrix (= nperm*ntree values per variable)
    return(MeanDecreaseAccuracy = colMeans(perror)) ## return only averages over permutations and trees
}




# cutpoints_list() returns:
# - vector of cutpoints (length=number of cutpoints) 
#   if variable is continuous
# - vector of indicators (length=number of categories x number of cutpoints)
#   if variable is categorical (nominal or ordered)
cutpoints_list <- function(tree, variableID) {

    cutp <- function(node) {
       if (node[[4]]) return(NULL)
       cp <- NULL
       if (node[[5]][[1]] == variableID)
           cp <- node[[5]][[3]]
       nl <- cutp(node[[8]])
       nr <- cutp(node[[9]])
       return(c(cp, nl, nr))
    }
    return(cutp(tree))
}


conditional_perm <- function(cond, xnames, input, tree, oob){

    ## get cutpoints of all conditioning variables of the current variable of interest 
    ## and generate design matrix for permutation from factors in help
    blocks <- vector(mode = "list", length = length(cond))
                    
    for (i in 1:length(cond)) {

        ## varID is variable index or column number of input (predictor matrix) 
        ## not variable name!
        varID <- which(xnames == cond[i])


        ## if conditioning variable is not used for splitting in current tree
        ## proceed with next conditioning variable
        cl <- cutpoints_list(tree, varID)
        if (is.null(cl)) next

        ## proceed cutpoints for different types of variables
        x <- input[, varID]
        xclass <- class(x)[1]
        if (xclass == "integer") xclass <- "numeric"

        block <- switch(xclass, "numeric" = cut(x, breaks = c(-Inf, sort(unique(cl)), Inf)),
                        "ordered" = cut(as.numeric(x), breaks =  c(-Inf, sort(unique(cl)), Inf)),
                        "factor" = {
                            CL <- matrix(as.logical(cl), nrow = nlevels(x))                            
                            rs <- rowSums(CL)
                            dlev <- (1:nrow(CL))[rs %in% rs[duplicated(rs)]]
                            fuse <- c()
                            for (ii in dlev) {
                                for (j in dlev[dlev > ii]) {
                                    if (all(CL[ii,] == CL[j,])) fuse <- rbind(fuse, c(ii, j))
                                }
                            }
                            xlev <- 1:nlevels(x)
                            newl <- nlevels(x) + 1
                            block <- as.integer(x)
                            for (l in xlev) {
                                if (NROW(fuse) == 0) break
                                if (any(fuse[, 1] == l)) {
                                    f <- c(l, fuse[fuse[, 1] == l, 2])
                                    fuse <- fuse[!fuse[,1] %in% f, , drop = FALSE]
                                    block[block %in% f] <- newl
                                    newl <- newl + 1
                                 }
                            }
                            as.factor(block)
                         })
         blocks[[i]] <- block
    }

    ## remove non-splitting variables
    names(blocks) <- cond
    blocks <- blocks[!sapply(blocks, is.null)]

    ## if none of the conditioning variables are used in the tree
    if (!length(blocks)>0){
        perm <- sample(which(oob))
        return(perm)
    } else {
        blocks <- as.data.frame(blocks)
        ## from factors blocks create design matrix
        f <- paste("~ - 1 + ", paste(colnames(blocks), collapse = ":", sep = ""))
        des <- model.matrix(as.formula(f), data = blocks)

        ## one conditional permutation
        perm <- 1:nrow(input)
        for (l in 1:ncol(des)) {
           index <- which(des[,l] > 0 & oob)
           if (length(index) > 1)
               perm[index] <- sample(index)
           }
        return(perm[oob])
    }
}

varimpAUC <- function(object, mincriterion = 0, conditional = FALSE, 
                      threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional)
{

    response <- object@responses
    input <- object@data@get("input")
    xnames <- colnames(input)
    inp <- initVariableFrame(input, trafo = NULL)
    y <- object@responses@variables[[1]]
    if(length(response@variables) != 1)
        stop("cannot compute variable importance measure for multivariate response")

    if (conditional || pre1.0_0) {
        if(!all(complete.cases(inp@variables)))
            stop("cannot compute variable importance measure with missing values")
    }
    CLASS <- all(response@is_nominal)
    ORDERED <- all(response@is_ordinal)
    if (CLASS) {      
          if (nlevels(y)>2) {
            warning("AUC=TRUE works only for binary y\n error rate is used instead of AUC")
            error <- function(x, oob) mean((levels(y)[sapply(x, which.max)] != y)[oob])
          }   
          else {
             error <- function(x, oob) {
               xoob <- sapply(x, function(x) x[1])[oob]
               yoob <- y[oob]
               which1 <- which(yoob==levels(y)[1])
               noob1 <- length(which1)
               noob <- length(yoob)
               if (noob1==0|noob1==noob) { return(NA) }       # AUC cannot be computed if all OOB-observations are from one class
               return(1-sum(kronecker(xoob[which1] , xoob[-which1],">"))/(noob1*(length(yoob)-noob1)))       # calculate AUC
            }
       }
       ###  stop
    }
    else {
        if (ORDERED) {
            error <- function(x, oob) mean((sapply(x, which.max) != 
                y)[oob])
        }
        else {
            error <- function(x, oob) mean((unlist(x) - y)[oob]^2)
        }
    }

    w <- object@initweights
    if (max(abs(w - 1)) > sqrt(.Machine$double.eps))
        warning(sQuote("varimp"), " with non-unity weights might give misleading results")

    perror <- matrix(0, nrow = nperm*length(object@ensemble), ncol = length(xnames))
    colnames(perror) <- xnames
        for (b in 1:length(object@ensemble)){
            tree <- object@ensemble[[b]]

            if(OOB){oob <- object@weights[[b]] == 0} else{ oob <- rep(TRUE, length(xnames))}
            p <- .Call("R_predict", tree, inp, mincriterion, -1L, PACKAGE = "atlantisPartyMod")
            eoob <- error(p, oob)

            for(j in unique(varIDs(tree))){
              for (per in 1:nperm){

                if (conditional || pre1.0_0) {
                    tmp <- inp
                    ccl <- create_cond_list(conditional, threshold, xnames[j], input)
                    if (is.null(ccl)) {
                        perm <- sample(which(oob))
                    } else {
                        perm <- conditional_perm(ccl, xnames, input, tree, oob)
                    }
                    tmp@variables[[j]][which(oob)] <- tmp@variables[[j]][perm]
                    p <- .Call("R_predict", tree, tmp, mincriterion, -1L,
                       PACKAGE = "atlantisPartyMod")
                } else {
                    p <- .Call("R_predict", tree, inp, mincriterion, as.integer(j),
                               PACKAGE = "atlantisPartyMod")
                }
                perror[(per+(b-1)*nperm), j] <- (error(p, oob) - eoob)

              } 
            } 
        } 

    perror <- as.data.frame(perror)
    return(MeanDecreaseAccuracy = colMeans(perror, na.rm = TRUE)) ## na.rm = TRUE because with AUC-perm. VIM NA values occur whenever a tree's OOB-observations are all from the same class
}