|
a |
|
b/partyMod/man/varimp.Rd |
|
|
1 |
\name{varimp} |
|
|
2 |
\alias{varimp} |
|
|
3 |
\alias{varimpAUC} |
|
|
4 |
\title{ Variable Importance } |
|
|
5 |
\description{ |
|
|
6 |
Standard and conditional variable importance for `cforest', following the permutation |
|
|
7 |
principle of the `mean decrease in accuracy' importance in `randomForest'. |
|
|
8 |
} |
|
|
9 |
\usage{ |
|
|
10 |
varimp(object, mincriterion = 0, conditional = FALSE, |
|
|
11 |
threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional) |
|
|
12 |
varimpAUC(object, mincriterion = 0, conditional = FALSE, |
|
|
13 |
threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional) |
|
|
14 |
} |
|
|
15 |
\arguments{ |
|
|
16 |
\item{object}{ an object as returned by \code{cforest}.} |
|
|
17 |
\item{mincriterion}{ the value of the test statistic or 1 - p-value that |
|
|
18 |
must be exceeded in order to include a split in the |
|
|
19 |
computation of the importance. The default \code{mincriterion = 0} |
|
|
20 |
guarantees that all splits are included.} |
|
|
21 |
\item{conditional}{ a logical determining whether unconditional or conditional |
|
|
22 |
computation of the importance is performed. } |
|
|
23 |
\item{threshold}{ the value of the test statistic or 1 - p-value of the association |
|
|
24 |
between the variable of interest and a covariate that must be |
|
|
25 |
exceeded in order to include the covariate in the conditioning |
|
|
26 |
scheme for the variable of interest (only relevant if |
|
|
27 |
\code{conditional = TRUE}). } |
|
|
28 |
\item{nperm}{ the number of permutations performed.} |
|
|
29 |
\item{OOB}{ a logical determining whether the importance is computed from the out-of-bag |
|
|
30 |
sample or the learning sample (not suggested).} |
|
|
31 |
\item{pre1.0_0}{ Prior to party version 1.0-0, the actual data values |
|
|
32 |
were permuted according to the original permutation |
|
|
33 |
importance suggested by Breiman (2001). Now the assignments |
|
|
34 |
to child nodes of splits in the variable of interest |
|
|
35 |
are permuted as described by Hapfelmeier et al. (2012), |
|
|
36 |
which allows for missing values in the explanatory |
|
|
37 |
variables and is more efficient with respect to memory consumption and |
|
|
38 |
computing time. This method does not apply to conditional |
|
|
39 |
variable importances.} |
|
|
40 |
} |
|
|
41 |
\details{ |
|
|
42 |
|
|
|
43 |
Function \code{varimp} can be used to compute variable importance measures |
|
|
44 |
similar to those computed by \code{\link[randomForest]{importance}}. Besides the |
|
|
45 |
standard version, a conditional version is available that adjusts for correlations between |
|
|
46 |
predictor variables. |
|
|
47 |
|
|
|
48 |
If \code{conditional = TRUE}, the importance of each variable is computed by permuting |
|
|
49 |
within a grid defined by the covariates that are associated (with 1 - p-value |
|
|
50 |
greater than \code{threshold}) with the variable of interest. |
|
|
51 |
The resulting variable importance score is conditional in the sense of beta coefficients in |
|
|
52 |
regression models, but represents the effect of a variable in both main effects and interactions. |
|
|
53 |
See Strobl et al. (2008) for details. |
|
|
54 |
|
|
|
55 |
Note, however, that all random forest results are subject to random variation. Thus, before |
|
|
56 |
interpreting the importance ranking, check whether the same ranking is achieved with a |
|
|
57 |
different random seed -- or otherwise increase the number of trees \code{ntree} in |
|
|
58 |
\code{\link{ctree_control}}. |
|
|
59 |
|
|
|
60 |
Note that in the presence of missing values in the predictor variables the procedure |
|
|
61 |
described in Hapfelmeier et al. (2012) is performed. |
|
|
62 |
|
|
|
63 |
Function \code{varimpAUC} implements AUC-based variable importances as |
|
|
64 |
described by Janitza et al. (2013). Here, the area under the curve |
|
|
65 |
instead of the accuracy is used to calculate the importance of each variable. |
|
|
66 |
This AUC-based variable importance measure is more robust towards class imbalance. |
|
|
67 |
|
|
|
68 |
For right-censored responses, \code{varimp} uses the integrated Brier score as a |
|
|
69 |
risk measure for computing variable importances. This feature is extremely slow and |
|
|
70 |
experimental; use at your own risk. |
|
|
71 |
|
|
|
72 |
} |
|
|
73 |
\value{ |
|
|
74 |
A vector of `mean decrease in accuracy' importance scores. |
|
|
75 |
} |
|
|
76 |
\references{ |
|
|
77 |
|
|
|
78 |
Leo Breiman (2001). Random Forests. \emph{Machine Learning}, 45(1), 5--32. |
|
|
79 |
|
|
|
80 |
Alexander Hapfelmeier, Torsten Hothorn, Kurt Ulm, and Carolin Strobl (2012). |
|
|
81 |
A New Variable Importance Measure for Random Forests with Missing Data. |
|
|
82 |
\emph{Statistics and Computing}, \url{http://dx.doi.org/10.1007/s11222-012-9349-1} |
|
|
83 |
|
|
|
84 |
Torsten Hothorn, Kurt Hornik, and Achim Zeileis (2006b). Unbiased |
|
|
85 |
Recursive Partitioning: A Conditional Inference Framework. |
|
|
86 |
\emph{Journal of Computational and Graphical Statistics}, \bold{15} (3), |
|
|
87 |
651--674. Preprint available from |
|
|
88 |
\url{http://statmath.wu-wien.ac.at/~zeileis/papers/Hothorn+Hornik+Zeileis-2006.pdf} |
|
|
89 |
|
|
|
90 |
Silke Janitza, Carolin Strobl and Anne-Laure Boulesteix (2013). An AUC-based Permutation |
|
|
91 |
Variable Importance Measure for Random Forests. \emph{BMC Bioinformatics}, \bold{14}, 119. |
|
|
92 |
\url{http://www.biomedcentral.com/1471-2105/14/119} |
|
|
93 |
|
|
|
94 |
Carolin Strobl, Anne-Laure Boulesteix, Thomas Kneib, Thomas Augustin, and Achim Zeileis (2008). |
|
|
95 |
Conditional Variable Importance for Random Forests. \emph{BMC Bioinformatics}, \bold{9}, 307. |
|
|
96 |
\url{http://www.biomedcentral.com/1471-2105/9/307} |
|
|
97 |
} |
|
|
98 |
\examples{ |
|
|
99 |
|
|
|
100 |
set.seed(290875) |
|
|
101 |
readingSkills.cf <- cforest(score ~ ., data = readingSkills, |
|
|
102 |
control = cforest_unbiased(mtry = 2, ntree = 50)) |
|
|
103 |
|
|
|
104 |
# standard importance |
|
|
105 |
varimp(readingSkills.cf) |
|
|
106 |
# the same modulo random variation |
|
|
107 |
varimp(readingSkills.cf, pre1.0_0 = TRUE) |
|
|
108 |
|
|
|
109 |
# conditional importance, may take a while... |
|
|
110 |
varimp(readingSkills.cf, conditional = TRUE) |
|
|
111 |
|
|
|
112 |
\dontrun{ |
|
|
113 |
data("GBSG2", package = "TH.data") |
|
|
114 |
### add a random covariate for sanity check |
|
|
115 |
set.seed(29) |
|
|
116 |
GBSG2$rand <- runif(nrow(GBSG2)) |
|
|
117 |
object <- cforest(Surv(time, cens) ~ ., data = GBSG2, |
|
|
118 |
control = cforest_unbiased(ntree = 20)) |
|
|
119 |
vi <- varimp(object) |
|
|
120 |
### compare variable importances and absolute z-statistics |
|
|
121 |
layout(matrix(1:2)) |
|
|
122 |
barplot(vi) |
|
|
123 |
barplot(abs(summary(coxph(Surv(time, cens) ~ ., data = GBSG2))$coeff[,"z"])) |
|
|
124 |
### looks more or less the same |
|
|
125 |
} |
|
|
126 |
} |
|
|
127 |
\keyword{tree} |