|
\name{Conditional Inference Trees}
\alias{ctree}
\alias{conditionalTree}
\title{ Conditional Inference Trees }
\description{
  Recursive partitioning for continuous, censored, ordered, nominal and
  multivariate response variables in a conditional inference framework.
}
\usage{
ctree(formula, data, subset = NULL, weights = NULL,
      controls = ctree_control(), xtrafo = ptrafo, ytrafo = ptrafo,
      scores = NULL)
}
|
|
\arguments{
  \item{formula}{ a symbolic description of the model to be fit. Note
      that symbols like \code{:} and \code{-} will not work
      and the tree will make use of all variables listed on the
      right-hand side of \code{formula}.}
  \item{data}{ a data frame containing the variables in the model. }
  \item{subset}{ an optional vector specifying a subset of observations to be
      used in the fitting process.}
  \item{weights}{ an optional vector of weights to be used in the fitting
      process. Only non-negative integer valued weights are
      allowed.}
  \item{controls}{ an object of class \code{\link{TreeControl}}, which can be
      obtained using \code{\link{ctree_control}}.}
  \item{xtrafo}{ a function to be applied to all input variables.
      By default, the \code{\link{ptrafo}} function is applied.}
  \item{ytrafo}{ a function to be applied to all response variables.
      By default, the \code{\link{ptrafo}} function is applied.}
  \item{scores}{ an optional named list of scores to be attached to ordered
      factors.}
}
|
|
\details{

  Conditional inference trees estimate a regression relationship by binary recursive
  partitioning in a conditional inference framework. Roughly, the algorithm
  works as follows: 1) Test the global null hypothesis of independence between
  any of the input variables and the response (which may be multivariate as well).
  Stop if this hypothesis cannot be rejected. Otherwise select the input
  variable with the strongest association to the response. This
  association is measured by a p-value corresponding to a test for the
  partial null hypothesis of a single input variable and the response.
  2) Implement a binary split in the selected input variable.
  3) Recursively repeat steps 1) and 2).
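
  Printing the fitted object displays the split variables together with the
  criterion reached at each inner node. As a minimal sketch (using the
  \code{airquality} data that also appears in the examples below):

\preformatted{
## grow a conditional inference tree for the Ozone response;
## printing it shows the selected split variables and criteria
airct <- ctree(Ozone ~ ., data = subset(airquality, !is.na(Ozone)))
print(airct)
}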
|
|

  The implementation utilizes a unified framework for conditional inference,
  or permutation tests, developed by Strasser and Weber (1999). The stop
  criterion in step 1) is either based on multiplicity adjusted p-values
  (\code{testtype == "Bonferroni"}
  or \code{testtype == "MonteCarlo"} in \code{\link{ctree_control}}),
  on the univariate p-values (\code{testtype == "Univariate"}),
  or on values of the test statistic
  (\code{testtype == "Teststatistic"}). In each case, the
  criterion is maximized, i.e., 1 - p-value is used. A split is implemented
  when the criterion exceeds the value given by \code{mincriterion} as
  specified in \code{\link{ctree_control}}. For example, when
  \code{mincriterion = 0.95}, the p-value must be smaller than
  \eqn{0.05} in order to split this node. This statistical approach ensures
  that a tree of the right size is grown without any form of pruning or
  cross-validation. The selection of the input variable to split in
  is based on the univariate p-values, avoiding a variable selection bias
  towards input variables with many possible cutpoints.
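
  As a brief sketch of how these controls fit together (the formula and the
  data frame \code{mydata} are placeholders, not objects shipped with the
  package), a more conservative tree can be requested by raising
  \code{mincriterion}:

\preformatted{
## hypothetical data: split a node only if the Bonferroni-adjusted
## p-value of the independence test is below 0.01
ct <- ctree(y ~ x1 + x2, data = mydata,
            controls = ctree_control(testtype = "Bonferroni",
                                     mincriterion = 0.99))
}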
|
|

  Multiplicity-adjusted Monte-Carlo p-values are computed
  following a "min-p" approach: the univariate
  p-values based on the limiting distribution (chi-square
  or normal) are computed for each of the random
  permutations of the data. This means that one should
  use a quadratic test statistic when factors are in
  play, because evaluating the multivariate normal
  distribution corresponding to a maximum-type statistic
  is time-consuming.
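
  For example (again with placeholder data), a quadratic statistic combined
  with Monte-Carlo resampling might be requested as follows:

\preformatted{
## hypothetical data: quadratic test statistic, p-values approximated
## by 9999 random permutations
ct <- ctree(y ~ x1 + x2, data = mydata,
            controls = ctree_control(teststat = "quad",
                                     testtype = "MonteCarlo",
                                     nresample = 9999))
}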
|
|

  By default, the scores for each ordinal factor \code{x} are
  \code{1:nlevels(x)}; this can be changed using
  \code{scores = list(x = c(1, 5, 6))}, for example.
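
  For instance (with a placeholder data frame \code{mydata} containing an
  ordered factor \code{x} with three levels):

\preformatted{
## attach non-equidistant scores to the three levels of x
ct <- ctree(y ~ x, data = mydata, scores = list(x = c(1, 5, 6)))
}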
|
|

  Predictions can be computed using \code{\link{predict}} or
  \code{\link{treeresponse}}. The first function accepts arguments
  \code{type = c("response", "node", "prob")} where \code{type = "response"}
  returns predicted means, predicted classes or median predicted survival
  times, \code{type = "node"} returns terminal node IDs (identical to
  \code{\link{where}}) and \code{type = "prob"} gives more information about
  the conditional distribution of the response, i.e., class probabilities
  or predicted Kaplan-Meier curves, and is identical to
  \code{\link{treeresponse}}. For observations with zero weights,
  predictions are computed from the fitted tree when \code{newdata = NULL}.
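
  As a short sketch, the three prediction types can be compared on the
  \code{airct} tree fitted in the examples below:

\preformatted{
predict(airct, type = "response")  ## predicted means
predict(airct, type = "node")      ## terminal node IDs
predict(airct, type = "prob")      ## conditional distributions,
                                   ## identical to treeresponse(airct)
}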
|
|

  For a general description of the methodology see Hothorn, Hornik and
  Zeileis (2006) and Hothorn, Hornik, van de Wiel and Zeileis (2006).
  Introductions for novices can be found in Strobl et al. (2009) and
  at \url{http://github.com/christophM/overview-ctrees.git}.

}
|
|
\value{
  An object of class \code{\link{BinaryTree-class}}.
}
|
|
\references{

  Helmut Strasser and Christian Weber (1999). On the asymptotic theory of permutation
  statistics. \emph{Mathematical Methods of Statistics}, \bold{8}, 220--250.

  Torsten Hothorn, Kurt Hornik, Mark A. van de Wiel and Achim Zeileis (2006).
  A Lego System for Conditional Inference. \emph{The American Statistician},
  \bold{60}(3), 257--263.

  Torsten Hothorn, Kurt Hornik and Achim Zeileis (2006). Unbiased Recursive
  Partitioning: A Conditional Inference Framework. \emph{Journal of
  Computational and Graphical Statistics}, \bold{15}(3), 651--674.
  Preprint available from
  \url{http://statmath.wu-wien.ac.at/~zeileis/papers/Hothorn+Hornik+Zeileis-2006.pdf}

  Carolin Strobl, James Malley and Gerhard Tutz (2009).
  An Introduction to Recursive Partitioning: Rationale, Application, and Characteristics of
  Classification and Regression Trees, Bagging, and Random Forests.
  \emph{Psychological Methods}, \bold{14}(4), 323--348.

}
|
|
\examples{

    set.seed(290875)

    ### regression
    airq <- subset(airquality, !is.na(Ozone))
    airct <- ctree(Ozone ~ ., data = airq,
                   controls = ctree_control(maxsurrogate = 3))
    airct
    plot(airct)
    mean((airq$Ozone - predict(airct))^2)
    ### extract terminal node ID, two ways
    all.equal(predict(airct, type = "node"), where(airct))

    ### classification
    irisct <- ctree(Species ~ ., data = iris)
    irisct
    plot(irisct)
    table(predict(irisct), iris$Species)

    ### estimated class probabilities, a list
    tr <- treeresponse(irisct, newdata = iris[1:10,])

    ### ordinal regression
    data("mammoexp", package = "TH.data")
    mammoct <- ctree(ME ~ ., data = mammoexp)
    plot(mammoct)

    ### estimated class probabilities
    treeresponse(mammoct, newdata = mammoexp[1:10,])

    ### survival analysis
    if (require("TH.data") && require("survival")) {
        data("GBSG2", package = "TH.data")
        GBSG2ct <- ctree(Surv(time, cens) ~ ., data = GBSG2)
        plot(GBSG2ct)
        treeresponse(GBSG2ct, newdata = GBSG2[1:2,])
    }

    ### if you are interested in the internals:
    ### generate doxygen documentation
    \dontrun{

    ### download src package into temp dir
    tmpdir <- tempdir()
    tgz <- download.packages("party", destdir = tmpdir)[2]
    ### extract
    untar(tgz, exdir = tmpdir)
    wd <- setwd(file.path(tmpdir, "party"))
    ### run doxygen (assuming it is there)
    system("doxygen inst/doxygen.cfg")
    setwd(wd)
    ### have fun
    browseURL(file.path(tmpdir, "party", "inst",
                        "documentation", "html", "index.html"))
    }
}
|
|
\keyword{tree}