Diff of /partyMod/man/varimp.Rd [000000] .. [fbf06f]

Switch to unified view

a b/partyMod/man/varimp.Rd
1
\name{varimp}
2
\alias{varimp}
3
\alias{varimpAUC}
4
\title{ Variable Importance }
5
\description{
6
    Standard and conditional variable importance for `cforest', following the permutation
7
    principle of the `mean decrease in accuracy' importance in `randomForest'.
8
}
9
\usage{
10
varimp(object, mincriterion = 0, conditional = FALSE, 
11
       threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional)
12
varimpAUC(object, mincriterion = 0, conditional = FALSE, 
13
       threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional)
14
}
15
\arguments{
16
  \item{object}{ an object as returned by \code{cforest}.}
17
  \item{mincriterion}{ the value of the test statistic or 1 - p-value that
18
                       must be exceeded in order to include a split in the 
19
                       computation of the importance. The default \code{mincriterion = 0}
20
                       guarantees that all splits are included.}
21
  \item{conditional}{ a logical determining whether unconditional or conditional 
22
                      computation of the importance is performed. }
23
  \item{threshold}{ the value of the test statistic or 1 - p-value of the association 
24
                    between the variable of interest and a covariate that must be 
25
                    exceeded in order to include the covariate in the conditioning
26
                    scheme for the variable of interest (only relevant if 
27
                    \code{conditional = TRUE}). }
28
  \item{nperm}{ the number of permutations performed.}
29
  \item{OOB}{ a logical determining whether the importance is computed from the out-of-bag 
30
              sample or the learning sample (not suggested).}
31
  \item{pre1.0_0}{ Prior to party version 1.0-0, the actual data values
32
                   were permuted according to the original permutation
33
                   importance suggested by Breiman (2001). Now the assignments
34
                   to child nodes of splits in the variable of interest
35
                   are permuted as described by Hapfelmeier et al. (2012),
36
                   which allows for missing values in the explanatory
37
                   variables and is more efficient with respect to memory consumption and
38
                   computing time. This method does not apply to conditional
39
                   variable importances.}
40
}
41
\details{
42
43
  Function \code{varimp} can be used to compute variable importance measures
44
  similar to those computed by \code{\link[randomForest]{importance}}. Besides the
45
  standard version, a conditional version is available that adjusts for correlations between
46
  predictor variables. 
47
  
48
  If \code{conditional = TRUE}, the importance of each variable is computed by permuting 
49
  within a grid defined by the covariates that are associated  (with 1 - p-value 
50
  greater than \code{threshold}) with the variable of interest.
51
  The resulting variable importance score is conditional in the sense of beta coefficients in   
52
  regression models, but represents the effect of a variable in both main effects and interactions.
53
  See Strobl et al. (2008) for details.
54
55
  Note, however, that all random forest results are subject to random variation. Thus, before
56
  interpreting the importance ranking, check whether the same ranking is achieved with a
57
  different random seed -- or otherwise increase the number of trees \code{ntree} in 
58
  \code{\link{cforest_control}}.
59
60
  Note that in the presence of missing values in the predictor variables, the procedure
61
  described in Hapfelmeier et al. (2012) is performed.
62
63
  Function \code{varimpAUC} implements AUC-based variable importances as
64
  described by Janitza et al. (2013).  Here, the area under the curve
65
  instead of the accuracy is used to calculate the importance of each variable. 
66
  This AUC-based variable importance measure is more robust towards class imbalance.
67
68
  For right-censored responses, \code{varimp} uses the integrated Brier score as a 
69
  risk measure for computing variable importances. This feature is extremely slow and
70
  experimental; use at your own risk.
71
72
}
73
\value{
74
  A vector of `mean decrease in accuracy' importance scores.
75
}
76
\references{ 
77
78
    Leo Breiman (2001). Random Forests. \emph{Machine Learning}, 45(1), 5--32.
79
80
    Alexander Hapfelmeier, Torsten Hothorn, Kurt Ulm, and Carolin Strobl (2012).
81
    A New Variable Importance Measure for Random Forests with Missing Data.
82
    \emph{Statistics and Computing}, \url{http://dx.doi.org/10.1007/s11222-012-9349-1}
83
84
    Torsten Hothorn, Kurt Hornik, and Achim Zeileis (2006). Unbiased
85
    Recursive Partitioning: A Conditional Inference Framework.
86
    \emph{Journal of Computational and Graphical Statistics}, \bold{15} (3),
87
    651--674.  Preprint available from
88
    \url{http://statmath.wu-wien.ac.at/~zeileis/papers/Hothorn+Hornik+Zeileis-2006.pdf}
89
90
    Silke Janitza, Carolin Strobl and Anne-Laure Boulesteix (2013). An AUC-based Permutation 
91
    Variable Importance Measure for Random Forests. \emph{BMC Bioinformatics}, \bold{14}, 119.
92
    \url{http://www.biomedcentral.com/1471-2105/14/119}
93
94
    Carolin Strobl, Anne-Laure Boulesteix, Thomas Kneib, Thomas Augustin, and Achim Zeileis (2008).
95
    Conditional Variable Importance for Random Forests. \emph{BMC Bioinformatics}, \bold{9}, 307. 
96
    \url{http://www.biomedcentral.com/1471-2105/9/307}
97
}
98
\examples{
99
    
100
   set.seed(290875)
101
   readingSkills.cf <- cforest(score ~ ., data = readingSkills, 
102
       control = cforest_unbiased(mtry = 2, ntree = 50))
103
104
   # standard importance
105
   varimp(readingSkills.cf)
106
   # the same modulo random variation
107
   varimp(readingSkills.cf, pre1.0_0 = TRUE)
108
109
   # conditional importance, may take a while...
110
   varimp(readingSkills.cf, conditional = TRUE)
111
112
   \dontrun{
113
   data("GBSG2", package = "TH.data")
114
   ### add a random covariate for sanity check
115
   set.seed(29)
116
   GBSG2$rand <- runif(nrow(GBSG2))
117
   object <- cforest(Surv(time, cens) ~ ., data = GBSG2, 
118
                     control = cforest_unbiased(ntree = 20)) 
119
   vi <- varimp(object)
120
   ### compare variable importances and absolute z-statistics
121
   layout(matrix(1:2))
122
   barplot(vi)
123
   barplot(abs(summary(coxph(Surv(time, cens) ~ ., data = GBSG2))$coeff[,"z"]))
124
   ### looks more or less the same
125
   }
126
}
127
\keyword{tree}