|
a |
|
b/partyMod/src/Splits.c |
|
|
1 |
|
|
|
2 |
/** |
|
|
3 |
Binary splits |
|
|
4 |
*\file Splits.c |
|
|
5 |
*\author $Author$ |
|
|
6 |
*\date $Date$ |
|
|
7 |
*/ |
|
|
8 |
|
|
|
9 |
#include "party.h" |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
/** |
|
|
13 |
Search for a cutpoint in a ordered variable x maximizing a two-sample |
|
|
14 |
statistic w.r.t. (the influence function of ) the response variable y. |
|
|
15 |
*\param x raw numeric measurements |
|
|
16 |
*\param p dimension of the transformation |
|
|
17 |
*\param y values of the influence function |
|
|
18 |
*\param q dimension of the influence function |
|
|
19 |
*\param weights case weights |
|
|
20 |
*\param n number of observations |
|
|
21 |
*\param orderx the ordering of the transformations, i.e. R> order(x) |
|
|
22 |
*\param splitctrl an object of class `SplitControl' |
|
|
23 |
*\param linexpcov2sample an (uninitialized) object of class |
|
|
24 |
`LinStatExpectCovar' with p = 1 |
|
|
25 |
*\param expcovinf an initialized object of class `ExpectCovarInfluence' |
|
|
26 |
*\param cutpoint return value; pointer to a double for the cutpoint in x |
|
|
27 |
*\param maxstat return value; pointer to a double for the |
|
|
28 |
maximal test statistic |
|
|
29 |
*\param statistics return value; pointer to a n-dim double |
|
|
30 |
for the statistics |
|
|
31 |
*/ |
|
|
32 |
|
|
|
33 |
void C_split(const double *x, int p, |
|
|
34 |
const double *y, int q, |
|
|
35 |
const double *weights, int n, |
|
|
36 |
const int *orderx, |
|
|
37 |
SEXP splitctrl, SEXP linexpcov2sample, |
|
|
38 |
SEXP expcovinf, double *cutpoint, double *maxstat, |
|
|
39 |
double *statistics) { |
|
|
40 |
|
|
|
41 |
double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar, |
|
|
42 |
tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp; |
|
|
43 |
double minobs, maxobs, xmax; |
|
|
44 |
int lastj, i, j, k; |
|
|
45 |
|
|
|
46 |
if (p != 1) error("C_split: p not equal to one"); |
|
|
47 |
tol = get_tol(splitctrl); |
|
|
48 |
|
|
|
49 |
/* init statistics and determine the maximal value with positive weight |
|
|
50 |
since we can't choose this one as cutpoint |
|
|
51 |
*/ |
|
|
52 |
xmax = 0.0; |
|
|
53 |
for (i = 0; i < n; i++) { |
|
|
54 |
statistics[i] = 0.0; |
|
|
55 |
if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i]; |
|
|
56 |
} |
|
|
57 |
|
|
|
58 |
/* we already have expecation and covariance of the response |
|
|
59 |
* values and the sum of the weights */ |
|
|
60 |
dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym)); |
|
|
61 |
dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym)); |
|
|
62 |
sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0]; |
|
|
63 |
|
|
|
64 |
/* if there is something to split */ |
|
|
65 |
if (sweights > 1) { |
|
|
66 |
|
|
|
67 |
/* we need to ensure that at least minbucket weights |
|
|
68 |
are there to split (either left or right) */ |
|
|
69 |
minprob = get_minprob(splitctrl); |
|
|
70 |
minbucket = get_minbucket(splitctrl); |
|
|
71 |
minobs = sweights * minprob + 1.0; |
|
|
72 |
|
|
|
73 |
if (minobs < minbucket) |
|
|
74 |
minobs = minbucket; |
|
|
75 |
maxobs = sweights * (1 - minprob) - 1.0; |
|
|
76 |
if (maxobs > sweights - minbucket) |
|
|
77 |
maxobs = sweights - minbucket; |
|
|
78 |
|
|
|
79 |
f1 = (double) sweights / (sweights - 1); |
|
|
80 |
f2 = 1.0 / (sweights - 1); |
|
|
81 |
w = 0.0; |
|
|
82 |
|
|
|
83 |
/* pointers to the R-objects */ |
|
|
84 |
dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym)); |
|
|
85 |
for (k = 0; k < q; k++) dlinstat[k] = 0.0; |
|
|
86 |
dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym)); |
|
|
87 |
dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym)); |
|
|
88 |
|
|
|
89 |
tx = 0.0; |
|
|
90 |
lastj = 0; |
|
|
91 |
|
|
|
92 |
/* for all possible cutpoints (defined by the observations x) */ |
|
|
93 |
for (i = 0; i < (n - 1); i++) { |
|
|
94 |
|
|
|
95 |
/* the ordering of the ith observation */ |
|
|
96 |
j = orderx[i] - 1; |
|
|
97 |
|
|
|
98 |
/* if the corresponding weight is zero */ |
|
|
99 |
if (weights[j] == 0.0) continue; |
|
|
100 |
|
|
|
101 |
/* just a check: can be removed later */ |
|
|
102 |
if (w > 0 && x[j] < tx) |
|
|
103 |
warning("C_split: inconsistent ordering: %f < %f!\n", |
|
|
104 |
x[j], tx); |
|
|
105 |
|
|
|
106 |
/* handle ties: delete the entry of the last visited observation |
|
|
107 |
(take care of zero weights!) */ |
|
|
108 |
if (w > 0 && x[j] == tx) |
|
|
109 |
statistics[lastj] = 0.0; |
|
|
110 |
|
|
|
111 |
/* store the value and position of the j smallest observation */ |
|
|
112 |
tx = x[j]; |
|
|
113 |
lastj = j; |
|
|
114 |
|
|
|
115 |
w += weights[j]; |
|
|
116 |
|
|
|
117 |
/* do not consider those splits */ |
|
|
118 |
if (w > maxobs || x[j] >= xmax) break; |
|
|
119 |
|
|
|
120 |
/* compute the linear statistic and expectation and |
|
|
121 |
* covariance if needed */ |
|
|
122 |
for (k = 0; k < q; k++) |
|
|
123 |
dlinstat[k] += y[n * k + j] * weights[j]; |
|
|
124 |
|
|
|
125 |
if (w >= minobs) { |
|
|
126 |
for (k = 0; k < q; k++) |
|
|
127 |
dexpect[k] = w * dExp_y[k]; |
|
|
128 |
|
|
|
129 |
f1w = f1 * w; |
|
|
130 |
f2ww = f2 * w * w; |
|
|
131 |
for (k = 0; k < q*q; k++) |
|
|
132 |
dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k]; |
|
|
133 |
} else { |
|
|
134 |
continue; |
|
|
135 |
} |
|
|
136 |
|
|
|
137 |
/* the absolute standardized test statistic, to be maximized */ |
|
|
138 |
/* statistics[j] = C_maxabsTestStatistic(dlinstat, |
|
|
139 |
dexpect, dcovar, q, tol); */ |
|
|
140 |
|
|
|
141 |
/* much faster but uses maxabs always*/ |
|
|
142 |
statistics[j] = 0.0; |
|
|
143 |
for (k = 0; k < q; k++) { |
|
|
144 |
if (dcovar[k * q + k] <= tol) continue; |
|
|
145 |
tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]); |
|
|
146 |
if (statistics[j] < tmp) statistics[j] = tmp; |
|
|
147 |
} |
|
|
148 |
|
|
|
149 |
} |
|
|
150 |
|
|
|
151 |
/* search for the maximum and the best separating cutpoint */ |
|
|
152 |
/* <FIXME> the result might differ between 32 and 64bit systems |
|
|
153 |
because of rounding errors in 'statistics' */ |
|
|
154 |
maxstat[0] = 0.0; |
|
|
155 |
for (i = 0; i < n; i++) { |
|
|
156 |
if (statistics[i] > maxstat[0]) { |
|
|
157 |
maxstat[0] = statistics[i]; |
|
|
158 |
cutpoint[0] = x[i]; |
|
|
159 |
} |
|
|
160 |
} |
|
|
161 |
/* </FIXME> */ |
|
|
162 |
} |
|
|
163 |
} |
|
|
164 |
|
|
|
165 |
|
|
|
166 |
/** |
|
|
167 |
R-interface to C_split (does not handle ordered y's) |
|
|
168 |
*\param x values of the transformation |
|
|
169 |
*\param y values of the influence function |
|
|
170 |
*\param weights case weights |
|
|
171 |
*\param orderx the ordering of the transformations, i.e. R> order(x) |
|
|
172 |
*\param linexpcov2sample an (uninitialized) object of class |
|
|
173 |
`LinStatExpectCovar' with p = 1 |
|
|
174 |
*\param expcovinf an initialized object of class `ExpectCovarInfluence' |
|
|
175 |
*\param splitctrl an object of class `SplitControl' |
|
|
176 |
*/ |
|
|
177 |
|
|
|
178 |
SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample, |
|
|
179 |
SEXP expcovinf, SEXP splitctrl) { |
|
|
180 |
|
|
|
181 |
SEXP ans, cutpoint, maxstat, statistics; |
|
|
182 |
|
|
|
183 |
PROTECT(ans = allocVector(VECSXP, 3)); |
|
|
184 |
SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1)); |
|
|
185 |
SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1)); |
|
|
186 |
SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x))); |
|
|
187 |
|
|
|
188 |
C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x), |
|
|
189 |
INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf, |
|
|
190 |
REAL(cutpoint), REAL(maxstat), REAL(statistics)); |
|
|
191 |
UNPROTECT(1); |
|
|
192 |
return(ans); |
|
|
193 |
} |
|
|
194 |
|
|
|
195 |
|
|
|
196 |
/** |
|
|
197 |
Search for a cutpoint in a unordered factor x maximizing a two-sample |
|
|
198 |
statistic w.r.t. (the influence function of ) the response variable y. |
|
|
199 |
*\param codingx the coding of x, i.e. as.numeric(x) |
|
|
200 |
*\param p dimension of the transformation |
|
|
201 |
*\param y values of the influence function |
|
|
202 |
*\param q dimension of the influence function |
|
|
203 |
*\param weights case weights |
|
|
204 |
*\param n number of observations |
|
|
205 |
*\param codingx the coding of x, i.e. as.numeric(x) |
|
|
206 |
*\param standstat the vector of the standardized statistics for x, y, |
|
|
207 |
weights |
|
|
208 |
*\param splitctrl an object of class `SplitControl' |
|
|
209 |
*\param linexpcov2sample an (uninitialized) object of class |
|
|
210 |
`LinStatExpectCovar' with p = 1 |
|
|
211 |
*\param expcovinf an initialized object of class `ExpectCovarInfluence' |
|
|
212 |
*\param cutpoint return value; pointer to a double for the cutpoint in x |
|
|
213 |
*\param levelset return value; pointer to a p-dim 0/1 integer |
|
|
214 |
*\param maxstat return value; pointer to a double for the |
|
|
215 |
maximal test statistic |
|
|
216 |
*\param statistics return value; pointer to a n-dim double for |
|
|
217 |
the statistics |
|
|
218 |
*/ |
|
|
219 |
|
|
|
220 |
void C_splitcategorical(const int *codingx, int p, |
|
|
221 |
const double *y, int q, |
|
|
222 |
const double *weights, int n, |
|
|
223 |
double *standstat, |
|
|
224 |
SEXP splitctrl, SEXP linexpcov2sample, |
|
|
225 |
SEXP expcovinf, double *cutpoint, int *levelset, |
|
|
226 |
double *maxstat, double *statistics) { |
|
|
227 |
|
|
|
228 |
double *tmpx, *tmptmpx, tmp = 0.0; |
|
|
229 |
int *irank, *ordertmpx, i, j, k, l, jp, chk; |
|
|
230 |
|
|
|
231 |
/* allocate memory */ |
|
|
232 |
tmpx = Calloc(n, double); |
|
|
233 |
ordertmpx = Calloc(n, int); |
|
|
234 |
irank = Calloc(p, int); |
|
|
235 |
tmptmpx = Calloc(n, double); |
|
|
236 |
|
|
|
237 |
/* for all response variables (aka: dummy variables) */ |
|
|
238 |
for (j = 0; j < q; j++) { |
|
|
239 |
|
|
|
240 |
jp = j * p; |
|
|
241 |
|
|
|
242 |
/* determine the ranking of the kth level among |
|
|
243 |
the standardized statistic: This induced an ordering of the |
|
|
244 |
observations */ |
|
|
245 |
for (k = 0; k < p; k++) { |
|
|
246 |
irank[k] = 1; |
|
|
247 |
for (l = 0; l < p; l++) |
|
|
248 |
if (standstat[jp + l] < standstat[jp + k]) irank[k]++; |
|
|
249 |
} |
|
|
250 |
|
|
|
251 |
/* a temporary response variable: the rank of the level */ |
|
|
252 |
for (i = 0; i < n; i++) { |
|
|
253 |
/* <FIXME> do we have to adjust weights for missing values here??? */ |
|
|
254 |
if (weights[i] == 0.0) { |
|
|
255 |
tmpx[i] = 0.0; |
|
|
256 |
} else { |
|
|
257 |
tmpx[i] = (double) irank[codingx[i] - 1]; |
|
|
258 |
} |
|
|
259 |
/* </FIXME> */ |
|
|
260 |
tmptmpx[i] = tmpx[i]; |
|
|
261 |
ordertmpx[i] = i + 1; |
|
|
262 |
} |
|
|
263 |
|
|
|
264 |
/* order(dtmpx) */ |
|
|
265 |
rsort_with_index(tmptmpx, ordertmpx, n); |
|
|
266 |
|
|
|
267 |
/* search for a cutpoint (now we do have an ordering) */ |
|
|
268 |
C_split(tmpx, 1, y, q, weights, n, ordertmpx, |
|
|
269 |
splitctrl, linexpcov2sample, |
|
|
270 |
expcovinf, cutpoint, maxstat, statistics); |
|
|
271 |
|
|
|
272 |
/* if we have seen an improvement: save this segmentation |
|
|
273 |
note: there may be splits with equal goodness */ |
|
|
274 |
chk = 0; |
|
|
275 |
if (maxstat[0] > tmp) { |
|
|
276 |
for (k = 0; k < p; k++) { |
|
|
277 |
if (irank[k] > cutpoint[0]) { |
|
|
278 |
levelset[k] = 1; |
|
|
279 |
chk += 1; |
|
|
280 |
} else { |
|
|
281 |
levelset[k] = 0; |
|
|
282 |
} |
|
|
283 |
} |
|
|
284 |
tmp = maxstat[0]; |
|
|
285 |
} |
|
|
286 |
/* <FIXME> make sure that at least one level goes left, |
|
|
287 |
C_split may end up with cutpoint > max(irank), why? |
|
|
288 |
</FIXME> |
|
|
289 |
*/ |
|
|
290 |
/* hm, why did I added |
|
|
291 |
if (chk == 0) tmp = 0.0; |
|
|
292 |
??? */ |
|
|
293 |
} |
|
|
294 |
maxstat[0] = tmp; |
|
|
295 |
|
|
|
296 |
/* free memory */ |
|
|
297 |
Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx); |
|
|
298 |
} |
|
|
299 |
|
|
|
300 |
|
|
|
301 |
/** |
|
|
302 |
R-interface to C_splitcategorical (does not handle ordered y's) |
|
|
303 |
*\param x the values of the x-transformation |
|
|
304 |
*\param codingx the coding of x, i.e. as.numeric(x) |
|
|
305 |
*\param y values of the influence function |
|
|
306 |
*\param weights case weights |
|
|
307 |
*\param linexpcov2sample an (uninitialized) object of class |
|
|
308 |
`LinStatExpectCovar' with p = 1 |
|
|
309 |
*\param linexpcov an initialized object of class `LinStatExpectCovar' |
|
|
310 |
*\param expcovinf an initialized object of class `ExpectCovarInfluence' |
|
|
311 |
*\param splitctrl an object of class `SplitControl' |
|
|
312 |
*/ |
|
|
313 |
|
|
|
314 |
SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights, |
|
|
315 |
SEXP linexpcov2sample, SEXP linexpcov, |
|
|
316 |
SEXP expcovinf, SEXP splitctrl) { |
|
|
317 |
|
|
|
318 |
SEXP ans, cutpoint, maxstat, statistics, levelset; |
|
|
319 |
double *standstat; |
|
|
320 |
|
|
|
321 |
C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x), |
|
|
322 |
1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov); |
|
|
323 |
|
|
|
324 |
standstat = Calloc(get_dimension(linexpcov), double); |
|
|
325 |
C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)), |
|
|
326 |
REAL(GET_SLOT(linexpcov, PL2_expectationSym)), |
|
|
327 |
REAL(GET_SLOT(linexpcov, PL2_covarianceSym)), |
|
|
328 |
get_dimension(linexpcov), get_tol(splitctrl), standstat); |
|
|
329 |
|
|
|
330 |
PROTECT(ans = allocVector(VECSXP, 4)); |
|
|
331 |
SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1)); |
|
|
332 |
SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1)); |
|
|
333 |
SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x))); |
|
|
334 |
SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x))); |
|
|
335 |
|
|
|
336 |
C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights), |
|
|
337 |
nrow(x), standstat, |
|
|
338 |
splitctrl, linexpcov2sample, expcovinf, |
|
|
339 |
REAL(cutpoint), INTEGER(levelset), REAL(maxstat), |
|
|
340 |
REAL(statistics)); |
|
|
341 |
|
|
|
342 |
UNPROTECT(1); |
|
|
343 |
Free(standstat); |
|
|
344 |
return(ans); |
|
|
345 |
} |