\documentclass[nojss]{jss}

%\VignetteIndexEntry{party: A Laboratory for Recursive Partytioning}
%\VignetteDepends{coin, TH.data}
%\VignetteKeywords{conditional inference, non-parametric models, recursive partitioning}
%\VignettePackage{party}

%% packages
\usepackage{amstext}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{thumbpdf}
\usepackage{rotating}
%% need no \usepackage{Sweave}

%% commands
\renewcommand{\Prob}{\mathbb{P} }
\renewcommand{\E}{\mathbb{E}}
\newcommand{\V}{\mathbb{V}}
\newcommand{\Var}{\mathbb{V}}
\newcommand{\R}{\mathbb{R} }
\newcommand{\N}{\mathbb{N} }
\newcommand{\C}{\mathbb{C} }
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\LS}{\mathcal{L}_n}
\newcommand{\TS}{\mathcal{T}_n}
\newcommand{\LSc}{\mathcal{L}_{\text{comb},n}}
\newcommand{\LSbc}{\mathcal{L}^*_{\text{comb},n}}
\newcommand{\F}{\mathcal{F}}
\newcommand{\A}{\mathcal{A}}
\newcommand{\yn}{y_{\text{new}}}
\newcommand{\z}{\mathbf{z}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\sX}{\mathcal{X}}
\newcommand{\sY}{\mathcal{Y}}
\newcommand{\T}{\mathbf{T}}
\newcommand{\x}{\mathbf{x}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\xn}{\mathbf{x}_{\text{new}}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\w}{\mathbf{w}}
\newcommand{\ws}{\mathbf{w}_\cdot}
\renewcommand{\t}{\mathbf{t}}
\newcommand{\M}{\mathbf{M}}
\renewcommand{\vec}{\text{vec}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\cellx}{\pi_n[\x]}
\newcommand{\partn}{\pi_n(\mathcal{L}_n)}
\newcommand{\err}{\text{Err}}
\newcommand{\ea}{\widehat{\text{Err}}^{(a)}}
\newcommand{\ecv}{\widehat{\text{Err}}^{(cv1)}}
\newcommand{\ecvten}{\widehat{\text{Err}}^{(cv10)}}
\newcommand{\eone}{\widehat{\text{Err}}^{(1)}}
\newcommand{\eplus}{\widehat{\text{Err}}^{(.632+)}}
\newcommand{\eoob}{\widehat{\text{Err}}^{(oob)}}
\newcommand{\bft}{\mathbf{t}}

\hyphenation{Qua-dra-tic}

\title{\pkg{party}: A Laboratory for Recursive Partytioning}
\Plaintitle{party: A Laboratory for Recursive Partytioning}

\author{Torsten Hothorn\\Ludwig-Maximilians-\\Universit\"at M\"unchen
   \And Kurt Hornik\\Wirtschaftsuniversit\"at Wien
   \And Achim Zeileis\\Wirtschaftsuniversit\"at Wien}
\Plainauthor{Torsten Hothorn, Kurt Hornik, Achim Zeileis}

\Abstract{
The \pkg{party} package \citep{Hothorn+Hornik+Zeileis:2006a} aims
at providing a recursive part(y)itioning laboratory assembling
various high- and low-level tools for building tree-based regression
and classification models. This includes conditional inference trees
(\code{ctree}), conditional inference forests (\code{cforest}) and
parametric model trees (\code{mob}). At the core of the package is
\code{ctree}, an implementation of conditional inference trees which
embed tree-structured regression models into a well defined theory of
conditional inference procedures. This non-parametric class of regression
trees is applicable to all kinds of regression problems, including
nominal, ordinal, numeric, censored as well as multivariate response
variables and arbitrary measurement scales of the covariates.
This vignette comprises a practical guide to exploiting the flexible
and extensible computational tools in \pkg{party} for fitting and
visualizing conditional inference trees.
}
\Keywords{conditional inference, non-parametric models, recursive partitioning}

\Address{
  Torsten Hothorn\\
  Institut f\"ur Statistik\\
  Ludwig-Maximilians-Universit\"at M\"unchen\\
  Ludwigstra{\ss}e 33\\
  80539 M\"unchen, Germany\\
  E-mail: \email{Torsten.Hothorn@R-project.org}\\
  URL: \url{http://www.stat.uni-muenchen.de/~hothorn/}\\

  Kurt Hornik, Achim Zeileis\\
  Institute for Statistics and Mathematics\\
  WU Wirtschaftsuniversit\"at Wien\\
  Augasse 2--6\\
  1090 Wien, Austria\\
  E-mail: \email{Kurt.Hornik@R-project.org}, \email{Achim.Zeileis@R-project.org}\\
  URL: \url{http://statmath.wu.ac.at/~hornik/},\\
  \phantom{URL:} \url{http://statmath.wu.ac.at/~zeileis/}
}

\begin{document}

<<setup, echo = FALSE, results = hide>>=
options(width = 70, SweaveHooks = list(leftpar =
    function() par(mai = par("mai") * c(1, 1.1, 1, 1))))
require("party")
require("coin")
set.seed(290875)
@

\setkeys{Gin}{width=\textwidth}


\section{Introduction}

The majority of recursive partitioning algorithms
are special cases of a simple two-stage algorithm:
First partition the observations by univariate splits in a recursive way and
second fit a constant model in each cell of the resulting partition.
The most popular implementations of such algorithms are `CART'
\citep{classifica:1984} and `C4.5' \citep{Quinlan1993}. Not unlike AID,
both perform an exhaustive search over all possible splits, maximizing an
information measure of node impurity and selecting the
covariate showing the best split.
This approach has two fundamental problems: overfitting and a selection
bias towards covariates with many possible splits.
With respect to the
overfitting problem \cite{Mingers1987} notes that the algorithm
\begin{quote}
[\ldots] has no concept of statistical significance, and so cannot
distinguish between a significant and an insignificant improvement in the
information measure.
\end{quote}
With the \pkg{party} package \citep[see][for a full description of its
methodological foundations]{Hothorn+Hornik+Zeileis:2006a}
we enter at the point where \cite{WhiteLiu1994} demand
\begin{quote}
[\ldots] a \textit{statistical} approach [to recursive partitioning] which
takes
into account the \textit{distributional} properties of the measures.
\end{quote}
We present a unified framework embedding recursive binary partitioning into
the well defined theory of permutation tests developed by
\cite{StrasserWeber1999}.
The conditional distribution of statistics measuring the association between
responses and covariates is the basis for an unbiased selection among
covariates measured at different scales.
Moreover, multiple test procedures are applied to determine whether no
significant
association between any of the covariates and the response can be stated,
in which case the recursion stops.

\section{Recursive binary partitioning} \label{algo}

We focus on regression models describing the conditional distribution of a
response variable $\Y$ given the status of $m$ covariates by
means of tree-structured recursive partitioning. The response $\Y$ from some
sample space $\sY$ may be multivariate as well.
The $m$-dimensional covariate vector $\X = (X_1, \dots, X_m)$ is taken
from a sample space $\sX = \sX_1 \times \cdots \times \sX_m$.
Both response variable and covariates may be measured
at arbitrary scales.
We assume that the conditional distribution $D(\Y | \X)$ of the response
$\Y$ given the covariates $\X$ depends on a function $f$ of the covariates
\begin{eqnarray*}
D(\Y | \X) = D(\Y | X_1, \dots, X_m) = D(\Y | f( X_1, \dots,
X_m)),
\end{eqnarray*}
where we restrict ourselves to partition based regression relationships,
i.e., $r$ disjoint cells $B_1, \dots, B_r$ partitioning the covariate space $\sX
= \bigcup_{k = 1}^r B_k$.
A model of the regression relationship is to be fitted based on a learning
sample $\LS$, i.e., a random sample of $n$ independent and
identically distributed observations, possibly with some covariates $X_{ji}$
missing,
\begin{eqnarray*}
\LS & = & \{ (\Y_i, X_{1i}, \dots, X_{mi}); i = 1, \dots, n \}.
\end{eqnarray*}
For the sake of simplicity, we use a learning sample
<<party-data, echo = TRUE>>=
ls <- data.frame(y = gl(3, 50, labels = c("A", "B", "C")),
                 x1 = rnorm(150) + rep(c(1, 0, 0), c(50, 50, 50)),
                 x2 = runif(150))
@
in the following illustrations.
A generic algorithm for recursive binary partitioning for a given learning
sample $\LS$ can be formulated using non-negative integer valued case weights $\w
= (w_1, \dots, w_n)$. Each node of a tree is represented by a vector of
case weights having non-zero elements when the corresponding observations
are elements of the node and are zero otherwise. The following algorithm
implements recursive binary partitioning (a schematic R sketch follows the
list):
\begin{enumerate}
\item For case weights $\w$ test the global null hypothesis of independence between
      any of the $m$ covariates and the response. Stop if this
      hypothesis cannot be rejected.
      Otherwise select the covariate $X_{j^*}$ with strongest
      association to $\Y$.
\item Choose a set $A^* \subset \sX_{j^*}$ in order to split $\sX_{j^*}$ into
      two disjoint sets $A^*$ and $\sX_{j^*} \setminus A^*$.
      The case weights $\w_\text{left}$ and $\w_\text{right}$ determine the
      two subgroups with $w_{\text{left},i} = w_i I(X_{j^*i} \in A^*)$ and
      $w_{\text{right},i} = w_i I(X_{j^*i} \not\in A^*)$ for all $i = 1,
      \dots, n$ ($I(\cdot)$ denotes the indicator function).
\item Recursively repeat steps 1 and 2 with modified case weights
      $\w_\text{left}$ and $\w_\text{right}$, respectively.
\end{enumerate}
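To make the flow of the algorithm concrete, the following deliberately
simplified sketch (not \pkg{party}'s actual implementation) expresses the
recursion in R; \texttt{test\_and\_select} and \texttt{best\_split} are
hypothetical placeholders for the conditional inference procedures developed
in Section~\ref{framework}:
<<party-skeleton, echo = TRUE, eval = FALSE>>=
grow <- function(w, test_and_select, best_split) {
    ## step 1: global independence test; NULL means H0 was not rejected
    jstar <- test_and_select(w)
    if (is.null(jstar)) return(list(weights = w))
    ## step 2: binary indicator I(X_{j*i} in A*) of the best split
    in_A <- best_split(jstar, w)
    ## step 3: recurse with modified case weights
    list(variable = jstar,
         left  = grow(w * in_A, test_and_select, best_split),
         right = grow(w * (1 - in_A), test_and_select, best_split))
}
@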
The separation of variable
selection and splitting procedure into steps 1 and 2 of the algorithm
is the key for the construction of interpretable tree
structures not suffering a systematic tendency towards covariates with many
possible splits or many missing values. In addition, a statistically
motivated and intuitive stopping criterion can be implemented: We stop
when the global null hypothesis of independence between the
response and any of the $m$ covariates cannot be rejected at a pre-specified
nominal level~$\alpha$. The algorithm induces a partition $\{B_1, \dots, B_r\}$ of
the covariate space $\sX$, where each cell $B \in \{B_1, \dots, B_r\}$
is associated with a vector of case weights.

In package \pkg{party}, the dependency structure and the variables may be
specified in a traditional formula based way
<<party-formula, echo = TRUE, results = hide>>=
library("party")
ctree(y ~ x1 + x2, data = ls)
@
Case counts $\w$ may be specified using the \texttt{weights} argument.
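For example, a hypothetical vector of case counts doubling the weight of the
first $50$ observations could be supplied as
\begin{Schunk}
\begin{Sinput}
R> w <- rep(c(2L, 1L, 1L), c(50, 50, 50))
R> ctree(y ~ x1 + x2, data = ls, weights = w)
\end{Sinput}
\end{Schunk}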

\section{Recursive partitioning by conditional inference} \label{framework}

In the main part of this section we focus on step 1 of the generic algorithm.
Unified tests for independence are constructed by means of the conditional
distribution of linear statistics in the permutation test framework
developed by \cite{StrasserWeber1999}. The determination of the best binary split
in one selected covariate and the handling of missing values
are performed based on standardized linear statistics within the same
framework as well.

\subsection{Variable selection and stopping criteria}

At step 1 of the generic algorithm given in Section~\ref{algo} we face an
independence problem. We need to decide whether there is any information
about the response variable covered by any of the $m$ covariates. In each node
identified by case weights $\w$, the
global hypothesis of independence is formulated in terms of the $m$ partial hypotheses
$H_0^j: D(\Y | X_j) = D(\Y)$ with global null hypothesis $H_0 = \bigcap_{j = 1}^m
H_0^j$.
When we are not able to reject $H_0$ at a pre-specified
level $\alpha$, we stop the recursion.
If the global hypothesis can be rejected, we measure the association
between $\Y$ and each of the covariates $X_j, j = 1, \dots, m$, by
test statistics or $P$-values indicating the deviation from the partial
hypotheses $H_0^j$.

For notational convenience and without loss of generality we assume that the
case weights $w_i$ are either zero or one. The symmetric group of all
permutations of the elements of $(1, \dots, n)$ with corresponding case
weights $w_i = 1$ is denoted by $S(\LS, \w)$. A more general notation is
given in the Appendix. We measure the association between $\Y$ and $X_j, j = 1, \dots, m$,
by linear statistics of the form
\begin{eqnarray} \label{linstat}
\T_j(\LS, \w) = \vec \left( \sum_{i=1}^n w_i g_j(X_{ji})
h(\Y_i, (\Y_1, \dots, \Y_n))^\top \right) \in \R^{p_jq}
\end{eqnarray}
where $g_j: \sX_j \rightarrow \R^{p_j}$ is a non-random transformation of
the covariate $X_j$. The transformation may be specified using the
\texttt{xtrafo} argument. If, for example, a ranking of \textit{both}
\texttt{x1} and \texttt{x2} is required,
<<party-xtrafo, echo = TRUE, results = hide>>=
ctree(y ~ x1 + x2, data = ls, xtrafo = function(data) trafo(data,
    numeric_trafo = rank))
@
can be used. The \emph{influence function}
$h: \sY \times \sY^n \rightarrow
\R^q$ depends on the responses $(\Y_1, \dots, \Y_n)$ in a permutation
symmetric way.
%%, i.e., $h(\Y_i, (\Y_1, \dots, \Y_n)) = h(\Y_i, (\Y_{\sigma(1)}, \dots,
%%\Y_{\sigma(n)}))$ for all permutations $\sigma \in S(\LS, \w)$.
Section~\ref{examples} explains how to choose $g_j$ and $h$ in different
practical settings. A $p_j \times q$ matrix is converted into a
$p_jq$ column vector by column-wise combination using the `vec' operator.
The influence function can be specified using the \texttt{ytrafo} argument.
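The `vec' operation corresponds to what \texttt{as.vector} does to a matrix
in R, i.e., stacking its columns; a one-line sketch:
\begin{Schunk}
\begin{Sinput}
R> M <- matrix(1:6, nrow = 2)
R> as.vector(M)   # column-wise vec of a 2 x 3 matrix
\end{Sinput}
\end{Schunk}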

The distribution of $\T_j(\LS, \w)$ under $H_0^j$ depends on the joint
distribution of $\Y$ and $X_j$, which is unknown under almost all practical
circumstances. At least under the null hypothesis one can dispose of this
dependency by fixing the covariates and conditioning on all possible
permutations of the responses. This principle leads to test procedures known
as \textit{permutation tests}.
The conditional expectation $\mu_j \in \R^{p_jq}$ and covariance $\Sigma_j
\in \R^{p_jq \times p_jq}$ of $\T_j(\LS, \w)$ under $H_0$ given
all permutations $\sigma \in S(\LS, \w)$ of the responses are derived by
\cite{StrasserWeber1999}:
\begin{eqnarray}
\mu_j & = & \E(\T_j(\LS, \w) | S(\LS, \w)) =
\vec \left( \left( \sum_{i = 1}^n w_i g_j(X_{ji}) \right) \E(h | S(\LS,
\w))^\top
\right), \nonumber \\
\Sigma_j & = & \V(\T_j(\LS, \w) | S(\LS, \w)) \nonumber \\
& = &
    \frac{\ws}{\ws - 1} \V(h | S(\LS, \w)) \otimes
        \left(\sum_i w_i g_j(X_{ji}) \otimes w_i g_j(X_{ji})^\top \right) \label{expectcovar}
\\
& - & \frac{1}{\ws - 1} \V(h | S(\LS, \w)) \otimes \left(
    \sum_i w_i g_j(X_{ji}) \right)
\otimes \left( \sum_i w_i g_j(X_{ji})\right)^\top
\nonumber
\end{eqnarray}
where $\ws = \sum_{i = 1}^n w_i$ denotes the sum of the case weights,
$\otimes$ is the Kronecker product and the conditional expectation of the
influence function is
\begin{eqnarray*}
\E(h | S(\LS, \w)) = \ws^{-1} \sum_i w_i h(\Y_i, (\Y_1, \dots, \Y_n)) \in
\R^q
\end{eqnarray*}
with corresponding $q \times q$ covariance matrix
\begin{eqnarray*}
\V(h | S(\LS, \w)) = \ws^{-1} \sum_i w_i \left(h(\Y_i, (\Y_1, \dots, \Y_n)) - \E(h | S(\LS, \w))
\right) \\
\left(h(\Y_i, (\Y_1, \dots, \Y_n)) - \E(h | S(\LS, \w))\right)^\top.
\end{eqnarray*}
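For a single covariate, the resulting linear and standardized statistics are
also exposed by the \pkg{coin} package; a small sketch using the learning
sample \texttt{ls} from above:
\begin{Schunk}
\begin{Sinput}
R> it <- independence_test(y ~ x1, data = ls, teststat = "quadratic")
R> statistic(it, "linear")         # the linear statistic T_j
R> statistic(it, "standardized")   # (t - mu) standardized by Sigma
\end{Sinput}
\end{Schunk}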
Having the conditional expectation and covariance at hand we are able to
standardize a linear statistic $\T \in \R^{pq}$ of the form
(\ref{linstat}) for some $p \in \{p_1, \dots, p_m\}$.
Univariate test statistics~$c$ mapping an observed multivariate
linear statistic $\bft \in
\R^{pq}$ into the real line can be of arbitrary form. An obvious choice is
the maximum of the absolute values of the standardized linear statistic
\begin{eqnarray*}
c_\text{max}(\bft, \mu, \Sigma) = \max_{k = 1, \dots, pq} \left| \frac{(\bft -
\mu)_k}{\sqrt{(\Sigma)_{kk}}} \right|
\end{eqnarray*}
utilizing the conditional expectation $\mu$ and covariance matrix
$\Sigma$. The application of a quadratic form $c_\text{quad}(\bft, \mu, \Sigma) =
(\bft - \mu) \Sigma^+ (\bft - \mu)^\top$ is one alternative, although
computationally more expensive because the Moore-Penrose
inverse $\Sigma^+$ of $\Sigma$ is involved.
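Both test statistics are easily computed from $\bft$, $\mu$ and $\Sigma$; a
minimal sketch (the arguments are hypothetical numeric vectors and a
covariance matrix, and \texttt{MASS::ginv} provides a Moore-Penrose inverse):
<<party-cfun, echo = TRUE, eval = FALSE>>=
c_max <- function(t, mu, Sigma)
    max(abs(t - mu) / sqrt(diag(Sigma)))
c_quad <- function(t, mu, Sigma)
    drop((t - mu) %*% MASS::ginv(Sigma) %*% (t - mu))
@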

The type of test statistic to be used can be specified by means of the
\texttt{ctree\_control} function, for example
\begin{Schunk}
\begin{Sinput}
R> ctree(y ~ x1 + x2, data = ls,
+      controls = ctree_control(teststat = "max"))
\end{Sinput}
\end{Schunk}
uses $c_\text{max}$ and
\begin{Schunk}
\begin{Sinput}
R> ctree(y ~ x1 + x2, data = ls,
+      controls = ctree_control(teststat = "quad"))
\end{Sinput}
\end{Schunk}
takes $c_\text{quad}$ (the default).

It is important to note that the test statistics $c(\bft_j, \mu_j, \Sigma_j),
j = 1, \dots, m$, cannot be directly compared in an unbiased
way unless all of the covariates are measured at the same scale, i.e.,
$p_1 = p_j, j = 2, \dots, m$.
In order to allow for an unbiased variable selection we need to switch to
the $P$-value scale because $P$-values for the conditional distribution of test
statistics $c(\T_j(\LS, \w), \mu_j, \Sigma_j)$
can be directly compared among covariates
measured at different scales. In step 1 of the generic algorithm we
select the covariate with minimum $P$-value, i.e., the covariate $X_{j^*}$
with $j^* = \argmin_{j = 1, \dots, m} P_j$, where
\begin{displaymath}
P_j = \Prob_{H_0^j}(c(\T_j(\LS, \w), \mu_j,
\Sigma_j) \ge c(\bft_j, \mu_j, \Sigma_j) | S(\LS, \w))
\end{displaymath}
denotes the $P$-value of the conditional test for $H_0^j$.
So far, we have only addressed testing each partial hypothesis $H_0^j$, which
is sufficient for an unbiased variable selection. A global test
for $H_0$ required in step 1 can be constructed via an aggregation of the
transformations $g_j, j = 1, \dots, m$,
i.e., using a linear statistic of the form
\begin{eqnarray*}
\T(\LS, \w) = \vec \left( \sum_{i=1}^n w_i
\left(g_1(X_{1i})^\top, \dots,
g_m(X_{mi})^\top\right)^\top h(\Y_i, (\Y_1, \dots, \Y_n))^\top \right).
%%\in \R^{\sum_j p_jq}
\end{eqnarray*}
However, this approach is less attractive for learning samples with
missing values. Universally applicable approaches are multiple test
procedures based on $P_1, \dots, P_m$.
Simple Bonferroni-adjusted $P$-values ($mP_j$ was used prior to version
0.8-4; now $1 - (1 - P_j)^m$ is used), available via
<<party-Bonf, echo = TRUE, results = hide>>=
ctree_control(testtype = "Bonferroni")
@
or a min-$P$-value resampling approach
<<party-MC, echo = TRUE, results = hide>>=
ctree_control(testtype = "MonteCarlo")
@
are just examples and we refer
to the multiple testing literature \citep[e.g.,][]{WestfallYoung1993}
for more advanced methods. We reject $H_0$ when the minimum
of the adjusted $P$-values is less than a pre-specified nominal level $\alpha$
and otherwise stop the algorithm. In this sense, $\alpha$
may be seen as a unique parameter determining the size of the resulting trees.
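In \texttt{ctree\_control}, this level is specified via the
\texttt{mincriterion} argument, which for $P$-value based test types
corresponds to $1 - \alpha$; e.g., $\alpha = 0.05$ amounts to
\begin{Schunk}
\begin{Sinput}
R> ctree_control(testtype = "Bonferroni", mincriterion = 0.95)
\end{Sinput}
\end{Schunk}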

\subsection{Splitting criteria}

Once we have selected a covariate in step 1 of the algorithm, the split itself can be
established by any split criterion, including those established by
\cite{classifica:1984} or \cite{Shih1999}. Instead of simple binary splits,
multiway splits can be implemented as well, for example utilizing
the work of \cite{OBrien2004}. However, most splitting criteria are not
applicable to response variables measured at arbitrary scales and we
therefore utilize the permutation test framework described above
to find the optimal binary split in one selected
covariate $X_{j^*}$ in step~2 of the generic algorithm. The goodness of a split is
evaluated by two-sample linear statistics which are special cases of the linear
statistic (\ref{linstat}).
For all possible subsets $A$ of the sample space $\sX_{j^*}$ the linear
statistic
\begin{eqnarray*}
\T^A_{j^*}(\LS, \w) = \vec \left( \sum_{i=1}^n w_i I(X_{j^*i} \in A)
h(\Y_i, (\Y_1, \dots, \Y_n))^\top \right) \in \R^{q}
\end{eqnarray*}
induces a two-sample statistic measuring the discrepancy between the samples
$\{ \Y_i | w_i > 0 \text{ and } X_{ji} \in A; i = 1, \dots, n\}$
and $\{ \Y_i | w_i > 0 \text{ and } X_{ji} \not\in A; i = 1, \dots, n\}$.
The conditional expectation $\mu_{j^*}^A$ and covariance
$\Sigma_{j^*}^A$
can be computed by (\ref{expectcovar}).
The split $A^*$ with a test statistic maximized over all possible
subsets $A$ is established:
\begin{eqnarray} \label{split}
A^* = \argmax_A c(\bft_{j^*}^A, \mu_{j^*}^A, \Sigma_{j^*}^A).
\end{eqnarray}
The statistics $c(\bft_{j^*}^A, \mu_{j^*}^A, \Sigma_{j^*}^A)$ are available
for each node with
<<party-ss, echo = TRUE, results = hide>>=
ctree_control(savesplitstats = TRUE)
@
and can be used to depict a scatter plot of the covariate
$\sX_{j^*}$ against the statistics.

Note that we do not need to compute the distribution of
$c(\bft_{j^*}^A, \mu_{j^*}^A, \Sigma_{j^*}^A)$ in step~2.
In order to guard against pathological splits one can restrict the number of
possible subsets that are evaluated, for example by introducing restrictions
on the sample size or the sum of the case weights in each of the two groups
of observations induced by a possible split. For example,
<<party-minsplit, echo = TRUE, results = hide>>=
ctree_control(minsplit = 20)
@
requires the sum of the weights in both the left and right daughter
node to exceed the value of $20$.

\subsection{Missing values and surrogate splits}

If an observation $X_{ji}$ in covariate $X_j$ is missing, we set the
corresponding case weight $w_i$ to zero for the computation of $\T_j(\LS, \w)$
and, if we would like to split in $X_j$, in $\T_{j}^A(\LS, \w)$ as well.
Once a split $A^*$ in $X_j$ has been implemented, surrogate splits can be
established by
searching for a split leading to roughly the same division of the
observations as the original split. One simply replaces the original
response variable by a binary variable $I(X_{ji} \in A^*)$ coding the
split and proceeds as described in the previous part. The number of
surrogate splits can be controlled using
<<party-maxsurr, echo = TRUE, results = hide>>=
ctree_control(maxsurrogate = 3)
@

\subsection{Inspecting a tree}

Once we have fitted a conditional tree via
<<party-fitted, echo = TRUE>>=
ct <- ctree(y ~ x1 + x2, data = ls)
@
we can inspect the results via a \texttt{print} method
<<party-print, echo = TRUE>>=
ct
@
or by looking at a graphical representation as in Figure~\ref{party-plot1}.

\begin{figure}[h]
\begin{center}
<<party-plot, echo = TRUE, fig = TRUE, height = 5, width = 8>>=
plot(ct)
@
\caption{A graphical representation of a classification tree.
\label{party-plot1}}
\end{center}
\end{figure}

Each node can be extracted by its node number, i.e., the root node is
<<party-nodes, echo = TRUE>>=
nodes(ct, 1)
@
This object is a conventional list with elements
<<party-nodelist, echo = TRUE>>=
names(nodes(ct, 1)[[1]])
@
and we refer to the manual pages for a description of those elements.
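For instance, the \texttt{criterion} element contains the test statistics
and the criterion values ($1 -$ $P$-value) the variable selection in this
node was based on; a sketch for the root node (see the manual pages for the
exact contents):
\begin{Schunk}
\begin{Sinput}
R> nodes(ct, 1)[[1]]$criterion
\end{Sinput}
\end{Schunk}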
The \texttt{Predict} function aims at computing predictions in the space of
the response variable, in our case a factor
<<party-Predict, echo = TRUE>>=
Predict(ct, newdata = ls)
@
When we are interested in properties of the conditional distribution of the
response given the covariates, we use
<<party-treeresponse, echo = TRUE>>=
treeresponse(ct, newdata = ls[c(1, 51, 101),])
@
which, in our case, is a list with conditional class probabilities.
We can determine the numbers of the nodes that new observations fall
into by
<<party-where, echo = TRUE>>=
where(ct, newdata = ls[c(1, 51, 101),])
@

\section{Examples} \label{examples}

\subsection{Univariate continuous or discrete regression}

For a univariate numeric response $\Y \in \R$, the most natural influence function
is the identity $h(\Y_i, (\Y_1, \dots, \Y_n)) = \Y_i$.
In case some observations with extremely large or small values have been
observed, a
ranking of the observations may be appropriate:
$h(\Y_i, (\Y_1, \dots, \Y_n)) = \sum_{k=1}^n w_k I(\Y_k \le \Y_i)$ for $i = 1, \dots,
n$.
Numeric covariates can be handled by the identity transformation
$g_{ji}(x) = x$ (ranks are possible, too). Nominal covariates at levels $1,
\dots, K$ are
represented by $g_{ji}(k) = e_K(k)$, the unit vector of length $K$ with
$k$th element being equal to one. Due to this flexibility, special test
procedures like the Spearman test, the
Wilcoxon-Mann-Whitney test or the Kruskal-Wallis test and permutation tests
based on ANOVA statistics or correlation coefficients
are covered by this framework. Splits obtained from (\ref{split}) maximize the
absolute value of the standardized difference between two means of the
values of the influence functions.
For prediction, one is usually interested in an estimate of
the expectation of
the response $\E(\Y | \X = \x)$ in each cell; an estimate can be
obtained by
\begin{eqnarray*}
\hat{\E}(\Y | \X = \x) = \left(\sum_{i=1}^n w_i(\x)\right)^{-1} \sum_{i=1}^n
w_i(\x) \Y_i.
\end{eqnarray*}
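For unit weights this is simply the mean response in the terminal node an
observation falls into; a small sketch using the \texttt{airquality} data
(restricted to complete \texttt{Ozone} measurements):
\begin{Schunk}
\begin{Sinput}
R> airq <- subset(airquality, !is.na(Ozone))
R> airct <- ctree(Ozone ~ ., data = airq)
R> tapply(airq$Ozone, where(airct), mean)  # node-wise means
R> unique(Predict(airct))                  # the same values, up to ordering
\end{Sinput}
\end{Schunk}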

\subsection{Censored regression}

The influence function $h$ may be chosen as
Logrank or Savage scores taking censoring into
account and one can proceed as for univariate continuous regression. This is
essentially the approach first published by \cite{regression:1988}.
An alternative is the weighting scheme suggested by
\cite{MolinaroDudiotvdLaan2003}. A weighted Kaplan-Meier curve for the case
weights $\w(\x)$ can serve as prediction.

\subsection{$J$-class classification}

The nominal response variable at levels $1, \dots, J$
is handled by influence
functions\linebreak $h(\Y_i, (\Y_1, \dots, \Y_n)) = e_J(\Y_i)$. Note that for a
nominal covariate $X_j$ at levels $1, \dots, K$ with
$g_{ji}(k) = e_K(k)$ the
corresponding linear statistic $\T_j$ is a vectorized contingency table.
The conditional class probabilities can be estimated via
\begin{eqnarray*}
\hat{\Prob}(\Y = y | \X = \x) = \left(\sum_{i=1}^n w_i(\x)\right)^{-1}
\sum_{i=1}^n
w_i(\x) I(\Y_i = y), \quad y = 1, \dots, J.
\end{eqnarray*}
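For unit weights these estimates are the relative class frequencies in each
terminal node, i.e., what \texttt{treeresponse} reported for the tree
\texttt{ct} above; a sketch of the equivalent manual computation:
\begin{Schunk}
\begin{Sinput}
R> prop.table(table(where(ct), ls$y), 1)
\end{Sinput}
\end{Schunk}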

\subsection{Ordinal regression}

Ordinal response variables measured at $J$ levels, and ordinal covariates
measured at $K$ levels, are associated with score vectors $\xi \in
\R^J$ and $\gamma \in \R^K$, respectively. Those scores reflect the
`distances' between the levels: If the variable is derived from an
underlying continuous variable, the scores can be chosen as the midpoints
of the intervals defining the levels. The linear statistic is now a linear
combination of the linear statistic $\T_j$ of the form
\begin{eqnarray*}
\M \T_j(\LS, \w) & = & \vec \left( \sum_{i=1}^n w_i \gamma^\top g_j(X_{ji})
\left(\xi^\top h(\Y_i, (\Y_1, \dots, \Y_n))\right)^\top \right)
\end{eqnarray*}
with $g_j(x) = e_K(x)$ and $h(\Y_i, (\Y_1, \dots, \Y_n)) = e_J(\Y_i)$.
If both response and covariate are ordinal, the matrix of coefficients
is given by the Kronecker product of both score vectors $\M = \xi \otimes \gamma \in
\R^{1, KJ}$. In case the response is ordinal only, the matrix of
coefficients $\M$ is a block matrix
\begin{eqnarray*}
\M =
\left(
  \begin{array}{ccc}
    \xi_1 & & 0 \\
     & \ddots & \\
    0 & & \xi_1
  \end{array} \right|
  \begin{array}{c}
     \\
    \hdots \\
     \\
  \end{array}
  \left|
  \begin{array}{ccc}
    \xi_q & & 0 \\
     & \ddots & \\
    0 & & \xi_q
  \end{array}
\right)
\text{ or }
\M = \text{diag}(\gamma)
\end{eqnarray*}
when one covariate is ordered but the response is not. For both $\Y$ and $X_j$
being ordinal, the corresponding test is known as linear-by-linear association
test \citep{Agresti2002}. Scores can be supplied to \texttt{ctree} using the
\texttt{scores} argument, see Section~\ref{illustrations} for an example.
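For instance, hypothetical scores placing the third level of the ordered
mammography experience response \texttt{ME} (analyzed in
Section~\ref{illustrations}) further away from the second one could be
attached via
\begin{Schunk}
\begin{Sinput}
R> ctree(ME ~ ., data = mammoexp, scores = list(ME = c(1, 2, 4)))
\end{Sinput}
\end{Schunk}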

\subsection{Multivariate regression}

For multivariate responses, the influence function is a combination of
influence functions appropriate for any of the univariate response variables
discussed in the previous paragraphs, e.g., indicators for multiple binary
responses \citep{Zhang1998,NohSongPark2004}, Logrank or Savage scores
for multiple failure times
%%\citep[for example tooth loss times, ][]{SuFan2004}
and the original observations or a rank transformation for multivariate regression
\citep{Death2002}.

\section{Illustrations and applications} \label{illustrations}

In this section, we present regression problems which illustrate the
potential fields of application of the methodology.
Conditional inference trees based on $c_\text{quad}$-type test statistics
using the identity influence function for numeric responses
and asymptotic $\chi^2$ distribution are applied.
For the stopping criterion a simple
Bonferroni correction is used and we follow the usual convention by choosing
the nominal level of the conditional independence tests as $\alpha = 0.05$.

\subsection{Tree pipit abundance}

<<treepipit-ctree, echo = TRUE>>=
data("treepipit", package = "coin")
tptree <- ctree(counts ~ ., data = treepipit)
@

\begin{figure}[t]
\begin{center}
<<treepipit-plot, echo = TRUE, fig = TRUE, height = 5, width = 8>>=
plot(tptree, terminal_panel = node_hist(tptree, breaks = 0:6-0.5, ymax = 65,
    horizontal = FALSE, freq = TRUE))
@
\caption{Conditional regression tree for the tree pipit data.}
\end{center}
\end{figure}

<<treepipit-x, echo = FALSE, results = hide>>=
x <- tptree@tree
@

The impact of certain environmental factors on the population density of the
tree pipit \textit{Anthus trivialis}
%%in Frankonian oak forests
is investigated by \cite{MuellerHothorn2004}.
The occurrence of tree pipits was recorded several times at
$n = 86$ stands which were established on a long environmental gradient.
Among nine environmental factors,
the covariate showing the largest association to the number of tree pipits
is the canopy overstorey $(P = $\Sexpr{round(1 - x$criterion[[3]], 3)}$)$.
Two groups of stands can be distinguished: Sunny stands with less than $40\%$
canopy overstorey $(n = \Sexpr{sum(x$left$weights)})$ show a
significantly higher density of tree pipits compared to darker stands with more than
$40\%$ canopy overstorey $(n = \Sexpr{sum(x$right$weights)})$.
This result is important for management decisions
in forestry enterprises: Cutting the overstorey with release of old
oaks creates a perfect habitat for this indicator species
of near natural forest environments.

\subsection{Glaucoma and laser scanning images}

<<glaucoma-ctree, echo = TRUE>>=
data("GlaucomaM", package = "TH.data")
gtree <- ctree(Class ~ ., data = GlaucomaM)
@

<<glaucoma-x, echo = FALSE, results = hide>>=
x <- gtree@tree
@

Laser scanning images taken from the eye background are expected to serve as
the basis of an automated system for glaucoma diagnosis.
Although prediction is more important in this application
\citep{new-glauco:2003}, a simple
visualization of the regression relationship is useful for comparing the
structures inherent in the learning sample with subject matter knowledge.
For $98$ patients and $98$ controls, matched by age
and gender, $62$ covariates describing the eye morphology are available. The
data are part of the
\pkg{TH.data} package (\url{http://CRAN.R-project.org}).
The first split in Figure~\ref{glaucoma} separates eyes with a volume
above reference less than
$\Sexpr{x$psplit$splitpoint} \text{ mm}^3$ in the inferior part of the
optic nerve head (\texttt{vari}).
Observations with larger volume are mostly controls, a finding which
corresponds to subject matter knowledge: The volume above reference measures the
thickness of the nerve layer, expected to decrease with a glaucomatous
damage of the optic nerve. Further separation is achieved by the volume
above surface global (\texttt{vasg}) and the volume above reference in the
temporal part of the optic nerve head (\texttt{vart}).

\setkeys{Gin}{width=.9\textwidth}
\begin{figure}[p]
\begin{center}
<<glaucoma-plot, echo = FALSE, fig = TRUE, height = 6, width = 10>>=
plot(gtree)
@
\caption{Conditional inference tree for the glaucoma data. For each inner node, the
Bonferroni-adjusted $P$-values are given; the fraction of glaucomatous eyes
is displayed for each terminal node. (Note: node 7 was not split prior to
version 0.8-4 because of using another formulation of the
Bonferroni-adjustment.) \label{glaucoma}}

<<glaucoma-plot-inner, echo = FALSE, fig = TRUE, height = 7, width = 10>>=
plot(gtree, inner_panel = node_barplot,
     edge_panel = function(...) invisible(), tnex = 1)
@
\caption{Conditional inference tree for the glaucoma data with the
fraction of glaucomatous eyes displayed for both inner and terminal
nodes. \label{glaucoma-inner}}
\end{center}
\end{figure}

The plot in Figure~\ref{glaucoma} is generated by

<<glaucoma-plot2, echo = TRUE, eval = FALSE>>=
plot(gtree)
@
\setkeys{Gin}{width=\textwidth}

and shows the distribution of the classes in
the terminal nodes. This distribution can be shown for the inner nodes as
well, namely by specifying the appropriate panel generating function
(\texttt{node\_barplot} in our case), see Figure~\ref{glaucoma-inner}.

<<glaucoma-plot-inner, echo = TRUE, eval = FALSE>>=
plot(gtree, inner_panel = node_barplot,
     edge_panel = function(...) invisible(), tnex = 1)
@

As mentioned in Section~\ref{framework}, it might be interesting to have a
look at the split statistics the split point estimate was derived from.
Those statistics can be extracted from the \texttt{splitstatistic} element
of a split and one can easily produce scatterplots against the selected
covariate. For all three inner nodes of \texttt{gtree}, we produce such a
plot in Figure~\ref{glaucoma-split}. For the root node, the estimated split point
seems very natural, since the process of split statistics has a
clear maximum indicating that the simple split point model is something
reasonable to assume here. This is less obvious for nodes $2$ and,
especially, $3$.

\begin{figure}[t]
\begin{center}
<<glaucoma-split-plot, echo = TRUE, fig = TRUE, height = 4, width = 11, leftpar = TRUE>>=
cex <- 1.6
inner <- nodes(gtree, c(1, 2, 5))
layout(matrix(1:length(inner), ncol = length(inner)))
out <- sapply(inner, function(i) {
    splitstat <- i$psplit$splitstatistic
    x <- GlaucomaM[[i$psplit$variableName]][splitstat > 0]
    plot(x, splitstat[splitstat > 0], main = paste("Node", i$nodeID),
         xlab = i$psplit$variableName, ylab = "Statistic", ylim = c(0, 10),
         cex.axis = cex, cex.lab = cex, cex.main = cex)
    abline(v = i$psplit$splitpoint, lty = 3)
})
@
\caption{Split point estimation in each inner node. The process of
the standardized two-sample test statistics for each possible
split point in the selected input variable is shown.
The estimated split point is given as a vertical dotted line.
\label{glaucoma-split}}
\end{center}
\end{figure}

The class predictions of the tree for the learning sample (and for new
observations as well) can be computed using the \texttt{Predict} function. A
comparison with the true class memberships is done by
<<glaucoma-prediction, echo = TRUE>>=
table(Predict(gtree), GlaucomaM$Class)
@
When we are interested in conditional class probabilities, the
\texttt{treeresponse} method must be used. A graphical representation is
shown in Figure~\ref{glaucoma-probplot}.

\setkeys{Gin}{width=.5\textwidth}
\begin{figure}[ht!]
\begin{center}
<<glaucoma-classprob, echo = TRUE, fig = TRUE, height = 4, width = 5>>=
prob <- sapply(treeresponse(gtree), function(x) x[1]) +
        runif(nrow(GlaucomaM), min = -0.01, max = 0.01)
splitvar <- nodes(gtree, 1)[[1]]$psplit$variableName
plot(GlaucomaM[[splitvar]], prob,
     pch = as.numeric(GlaucomaM$Class), ylab = "Conditional Class Prob.",
     xlab = splitvar)
abline(v = nodes(gtree, 1)[[1]]$psplit$splitpoint, lty = 2)
legend(0.15, 0.7, pch = 1:2, legend = levels(GlaucomaM$Class), bty = "n")
@
\caption{Estimated conditional class probabilities (slightly jittered)
for the Glaucoma data depending on the first split variable.
The vertical line denotes the first split point. \label{glaucoma-probplot}}
\end{center}
\end{figure}

\subsection{Node positive breast cancer}

<<GBSGS-ctree, echo = TRUE>>=
data("GBSG2", package = "TH.data")
stree <- ctree(Surv(time, cens) ~ ., data = GBSG2)
@

\setkeys{Gin}{width=\textwidth}
\begin{figure}[t]
\begin{center}
<<GBSG2-plot, echo = TRUE, fig = TRUE, height = 6, width = 10>>=
plot(stree)
@
\caption{Tree-structured survival model for the GBSG2 data and the distribution of
survival times in the terminal nodes. The median survival time is displayed in
each terminal node of the tree. \label{gbsg2}}
\end{center}
\end{figure}

Recursive partitioning for censored responses has attracted a lot of interest
\citep[e.g.,][]{regression:1988,relative-r:1992}.
Survival trees using $P$-value adjusted Logrank statistics
are used by \cite{statistics:2001a}
for the evaluation of prognostic factors for the German Breast
Cancer Study Group (GBSG2) data, a prospective controlled clinical trial on the
treatment of node positive breast cancer patients. Here, we use
Logrank scores as well. Complete data of seven prognostic factors of $686$
women are used for prognostic modeling; the
dataset is available within the
\pkg{TH.data} package.
The number of positive lymph nodes
(\texttt{pnodes}) and the progesterone receptor (\texttt{progrec}) have
been identified as prognostic factors in the survival tree analysis
by \cite{statistics:2001a}. Here, the binary variable coding whether a
hormonal therapy was applied or not (\texttt{horTh}) additionally is
part of the model depicted in Figure~\ref{gbsg2}.

The estimated median survival time for new patients is less informative
than the whole Kaplan-Meier curve estimated from the patients in the
learning sample for each terminal node. We can compute those `predictions'
by means of the \texttt{treeresponse} method
<<GBSG2-KM, echo = TRUE>>=
treeresponse(stree, newdata = GBSG2[1:2,])
@

\subsection{Mammography experience}

<<mammo-ctree, echo = TRUE>>=
data("mammoexp", package = "TH.data")
mtree <- ctree(ME ~ ., data = mammoexp)
@

\begin{figure}[t]
\begin{center}
<<mammo-plot, echo = TRUE, fig = TRUE, height = 5, width = 8>>=
plot(mtree)
@
\caption{Ordinal regression for the mammography experience data with the
fractions of (never, within a year, over one year) given in the nodes.
No admissible split was found for node 4 because only $5$ of $91$ women reported
a family history of breast cancer and the sample size restrictions would
require more than $5$ observations in each daughter node. \label{mammoexp}}
\end{center}
\end{figure}

Ordinal response variables are common in investigations where the response
is a subjective human interpretation.
We use an example given by \cite{HosmerLemeshow2000}, p. 264,
studying the relationship between the mammography experience (never,
within a year, over one year) and opinions about mammography expressed in
questionnaires answered by $n = 412$ women. The resulting partition based on
scores $\xi = (1,2,3)$ is given in Figure~\ref{mammoexp}.
Women who (strongly) agree with the question `You do not need a mammogram unless
you develop symptoms' have seldom experienced a mammography. The variable
\texttt{benefit} is a score with low values indicating a strong agreement with the
benefits of the examination. For those women in (strong) disagreement with the first
question above, low values of \texttt{benefit} identify persons being more likely to have
experienced such an examination at all.

\bibliography{partyrefs}

\end{document}

\subsection{Hunting spiders}

Finally, we take a closer look at a challenging dataset on animal abundance
first reported by \cite{AartSmeenk1975} and
re-analyzed by \cite{Death2002} using regression trees dealing with
multivariate responses. The
abundance of $12$ hunting spider species is regressed on six
environmental variables (\texttt{water}, \texttt{sand}, \texttt{moss},
\texttt{reft}, \texttt{twigs} and \texttt{herbs}) for $n = 28$
observations. Because of the small sample size we allow for a split if at
least $5$ observations are elements of a node and approximate the
distribution of a $c_\text{max}$ test statistic by $9999$ Monte-Carlo
replications. The prognostic factors \texttt{twigs} and \texttt{water}
found by \cite{Death2002} are confirmed by the model
shown in Figure~\ref{spider} which additionally identifies \texttt{reft}.
The data are available in package
\pkg{mvpart} \citep{mvpart2004} at \url{http://CRAN.R-project.org}.

<<spider-ctree, echo = TRUE, eval = FALSE>>=
data("spider", package = "mvpart")
sptree <- ctree(arct.lute + pard.lugu + zora.spin + pard.nigr + pard.pull +
    aulo.albi + troc.terr + alop.cune + pard.mont + alop.acce +
    alop.fabr + arct.peri ~ herbs + reft + moss + sand + twigs + water,
    controls = ctree_control(teststat = "max", testtype = "MonteCarlo",
        nresample = 9999, minsplit = 5, mincriterion = 0.9), data = spider)
sptree@tree$criterion

library("coin")
data("spider", package = "mvpart")
it <- independence_test(arct.lute + pard.lugu + zora.spin + pard.nigr + pard.pull +
    aulo.albi + troc.terr + alop.cune + pard.mont + alop.acce +
    alop.fabr + arct.peri ~ herbs + reft + moss + sand + twigs + water,
    data = spider, distribution = approximate(B = 19999))
statistic(it, "standardized")
pvalue(it)
@

\begin{figure}[t]
\begin{center}
<<spider-plot, eval = FALSE, echo = TRUE, fig = TRUE>>=
plot(sptree, terminal_panel = node_terminal)
@
\caption{Regression tree for hunting spider abundance. The species with
maximal abundance is given for each terminal node. \label{spider}}
\end{center}
\end{figure}