a b/partyMod/src/Splits.c
1
2
/**
3
    Binary splits 
4
    *\file Splits.c
5
    *\author $Author$
6
    *\date $Date$
7
*/
8
                
9
#include "party.h"
10
11
12
/**
13
    Search for a cutpoint in a ordered variable x maximizing a two-sample
14
    statistic w.r.t. (the influence function of ) the response variable y.
15
    *\param x raw numeric measurements 
16
    *\param p dimension of the transformation
17
    *\param y values of the influence function
18
    *\param q dimension of the influence function
19
    *\param weights case weights
20
    *\param n number of observations
21
    *\param orderx the ordering of the transformations, i.e. R> order(x)
22
    *\param splitctrl an object of class `SplitControl'
23
    *\param linexpcov2sample an (uninitialized) object of class 
24
                             `LinStatExpectCovar' with p = 1
25
    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
26
    *\param cutpoint return value; pointer to a double for the cutpoint in x
27
    *\param maxstat return value; pointer to a double for the 
28
                    maximal test statistic
29
    *\param statistics return value; pointer to a n-dim double 
30
                       for the statistics
31
*/
32
                                
33
void C_split(const double *x, int p,
34
             const double *y, int q,
35
             const double *weights, int n,
36
             const int *orderx,
37
             SEXP splitctrl, SEXP linexpcov2sample, 
38
             SEXP expcovinf, double *cutpoint, double *maxstat, 
39
             double *statistics) {
40
41
    double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar, 
42
           tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
43
    double minobs, maxobs, xmax;
44
    int lastj, i, j, k;
45
46
    if (p != 1) error("C_split: p not equal to one");
47
    tol = get_tol(splitctrl);
48
49
    /* init statistics and determine the maximal value with positive weight 
50
       since we can't choose this one as cutpoint
51
    */
52
    xmax = 0.0;
53
    for (i = 0; i < n; i++) {
54
        statistics[i] = 0.0;
55
        if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
56
    }
57
58
    /* we already have expecation and covariance of the response
59
     * values and the sum of the weights */
60
    dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
61
    dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
62
    sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
63
64
    /* if there is something to split */
65
    if (sweights > 1) {
66
67
        /* we need to ensure that at least minbucket weights 
68
           are there to split (either left or right) */
69
        minprob = get_minprob(splitctrl);
70
        minbucket = get_minbucket(splitctrl);
71
        minobs = sweights * minprob + 1.0;
72
73
        if (minobs < minbucket) 
74
            minobs = minbucket; 
75
        maxobs = sweights * (1 - minprob) - 1.0;
76
        if (maxobs > sweights - minbucket) 
77
            maxobs = sweights - minbucket; 
78
79
        f1 = (double) sweights / (sweights - 1);
80
        f2 = 1.0 / (sweights - 1);
81
        w = 0.0;
82
    
83
        /* pointers to the R-objects */
84
        dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
85
        for (k = 0; k < q; k++) dlinstat[k] = 0.0;
86
        dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
87
        dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
88
89
        tx = 0.0;
90
        lastj = 0;
91
92
        /* for all possible cutpoints (defined by the observations x) */
93
        for (i = 0; i < (n - 1); i++) {
94
    
95
            /* the ordering of the ith observation */
96
            j = orderx[i] - 1;
97
        
98
            /* if the corresponding weight is zero */
99
            if (weights[j] == 0.0) continue;
100
101
            /* just a check: can be removed later */
102
            if (w > 0 && x[j] < tx)
103
                warning("C_split: inconsistent ordering: %f < %f!\n", 
104
                        x[j], tx);
105
        
106
            /* handle ties: delete the entry of the last visited observation
107
               (take care of zero weights!) */
108
            if (w > 0 && x[j] == tx)
109
                statistics[lastj] = 0.0; 
110
111
            /* store the value and position of the j smallest observation */
112
            tx = x[j];
113
            lastj = j;
114
        
115
            w += weights[j];
116
117
            /* do not consider those splits */
118
            if (w > maxobs || x[j] >= xmax) break;
119
120
            /* compute the linear statistic and expectation and 
121
             * covariance if needed */
122
            for (k = 0; k < q; k++)
123
                dlinstat[k] += y[n * k + j] * weights[j];
124
 
125
            if (w >= minobs) {
126
                for (k = 0; k < q; k++)
127
                    dexpect[k] = w * dExp_y[k];
128
129
                f1w = f1 * w;
130
                f2ww = f2 * w * w;
131
                for (k = 0; k < q*q; k++)
132
                    dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
133
            } else {
134
                continue;
135
            }
136
        
137
            /* the absolute standardized test statistic, to be maximized */
138
            /* statistics[j] = C_maxabsTestStatistic(dlinstat, 
139
                   dexpect, dcovar, q, tol); */
140
141
            /* much faster but uses maxabs always*/
142
            statistics[j] = 0.0;
143
            for (k = 0; k < q; k++) {
144
                if (dcovar[k * q + k] <= tol) continue;
145
                tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
146
                if (statistics[j] < tmp) statistics[j] = tmp;
147
            }
148
149
        }
150
    
151
        /* search for the maximum and the best separating cutpoint */
152
        /* <FIXME> the result might differ between 32 and 64bit systems 
153
                   because of rounding errors in 'statistics' */
154
        maxstat[0] = 0.0;        
155
        for (i = 0; i < n; i++) {
156
            if (statistics[i] > maxstat[0]) {
157
                maxstat[0] = statistics[i];
158
                cutpoint[0] = x[i];
159
            }
160
        }
161
        /* </FIXME> */
162
    }
163
}
164
165
166
/**
167
    R-interface to C_split (does not handle ordered y's)
168
    *\param x values of the transformation
169
    *\param y values of the influence function
170
    *\param weights case weights
171
    *\param orderx the ordering of the transformations, i.e. R> order(x)
172
    *\param linexpcov2sample an (uninitialized) object of class 
173
                             `LinStatExpectCovar' with p = 1
174
    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
175
    *\param splitctrl an object of class `SplitControl'
176
*/
177
178
SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample, 
179
             SEXP expcovinf, SEXP splitctrl) {
180
             
181
    SEXP ans, cutpoint, maxstat, statistics;
182
    
183
    PROTECT(ans = allocVector(VECSXP, 3));
184
    SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
185
    SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
186
    SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
187
    
188
    C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
189
            INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf,
190
            REAL(cutpoint), REAL(maxstat), REAL(statistics));
191
    UNPROTECT(1);
192
    return(ans);
193
}
194
195
196
/**
197
    Search for a cutpoint in a unordered factor x maximizing a two-sample
198
    statistic w.r.t. (the influence function of ) the response variable y.
199
    *\param codingx the coding of x, i.e. as.numeric(x)
200
    *\param p dimension of the transformation
201
    *\param y values of the influence function
202
    *\param q dimension of the influence function
203
    *\param weights case weights
204
    *\param n number of observations
205
    *\param codingx the coding of x, i.e. as.numeric(x)
206
    *\param standstat the vector of the standardized statistics for x, y, 
207
                      weights 
208
    *\param splitctrl an object of class `SplitControl'
209
    *\param linexpcov2sample an (uninitialized) object of class 
210
                             `LinStatExpectCovar' with p = 1
211
    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
212
    *\param cutpoint return value; pointer to a double for the cutpoint in x
213
    *\param levelset return value; pointer to a p-dim 0/1 integer
214
    *\param maxstat return value; pointer to a double for the 
215
                    maximal test statistic
216
    *\param statistics return value; pointer to a n-dim double for 
217
                       the statistics
218
*/
219
220
void C_splitcategorical(const int *codingx, int p,
221
                        const double *y, int q,
222
                        const double *weights, int n,
223
                        double *standstat,
224
                        SEXP splitctrl, SEXP linexpcov2sample, 
225
                        SEXP expcovinf, double *cutpoint, int *levelset, 
226
                        double *maxstat, double *statistics) {
227
228
    double *tmpx, *tmptmpx, tmp = 0.0;
229
    int *irank, *ordertmpx, i, j, k, l, jp, chk;
230
231
    /* allocate memory */
232
    tmpx = Calloc(n, double);
233
    ordertmpx = Calloc(n, int);
234
    irank = Calloc(p, int);
235
    tmptmpx = Calloc(n, double);
236
237
    /* for all response variables (aka: dummy variables) */
238
    for (j = 0; j < q; j++) {
239
    
240
        jp = j * p;
241
242
        /* determine the ranking of the kth level among 
243
           the standardized statistic: This induced an ordering of the 
244
           observations */
245
        for (k = 0; k < p; k++) {
246
            irank[k] = 1;
247
            for (l = 0; l < p; l++)
248
                if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
249
        }
250
        
251
        /* a temporary response variable: the rank of the level */
252
        for (i = 0; i < n; i++) {
253
            /* <FIXME> do we have to adjust weights for missing values here??? */
254
            if (weights[i] == 0.0) {
255
                tmpx[i] = 0.0;
256
            } else {
257
                tmpx[i] = (double) irank[codingx[i] - 1];
258
            }
259
            /* </FIXME> */
260
            tmptmpx[i] = tmpx[i];
261
            ordertmpx[i] = i + 1;
262
        }
263
        
264
        /* order(dtmpx) */
265
        rsort_with_index(tmptmpx, ordertmpx, n);
266
267
        /* search for a cutpoint (now we do have an ordering) */
268
        C_split(tmpx, 1, y, q, weights, n, ordertmpx,
269
                splitctrl, linexpcov2sample,
270
                expcovinf, cutpoint, maxstat, statistics);
271
272
        /* if we have seen an improvement: save this segmentation 
273
           note: there may be splits with equal goodness */
274
        chk = 0;
275
        if (maxstat[0] > tmp) {
276
            for (k = 0; k < p; k++) {
277
                if (irank[k] > cutpoint[0]) {
278
                    levelset[k] = 1;
279
                    chk += 1;
280
                } else {
281
                    levelset[k] = 0;
282
                }
283
            }
284
            tmp = maxstat[0];
285
        }
286
        /* <FIXME> make sure that at least one level goes left,
287
                   C_split may end up with cutpoint > max(irank), why?
288
           </FIXME>
289
        */
290
        /* hm, why did I added 
291
        if (chk == 0) tmp = 0.0; 
292
        ??? */
293
    }
294
    maxstat[0] = tmp;
295
296
    /* free memory */
297
    Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
298
}
299
300
301
/**
302
    R-interface to C_splitcategorical (does not handle ordered y's)
303
    *\param x the values of the x-transformation
304
    *\param codingx the coding of x, i.e. as.numeric(x)
305
    *\param y values of the influence function
306
    *\param weights case weights
307
    *\param linexpcov2sample an (uninitialized) object of class 
308
                             `LinStatExpectCovar' with p = 1
309
    *\param linexpcov an initialized object of class `LinStatExpectCovar'
310
    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
311
    *\param splitctrl an object of class `SplitControl'
312
*/
313
314
SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights, 
315
                        SEXP linexpcov2sample, SEXP linexpcov, 
316
                        SEXP expcovinf, SEXP splitctrl) {
317
             
318
    SEXP ans, cutpoint, maxstat, statistics, levelset;
319
    double *standstat;
320
321
    C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
322
                    1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
323
324
    standstat = Calloc(get_dimension(linexpcov), double);
325
    C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
326
                  REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
327
                  REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
328
                  get_dimension(linexpcov), get_tol(splitctrl), standstat);
329
330
    PROTECT(ans = allocVector(VECSXP, 4));
331
    SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
332
    SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
333
    SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
334
    SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
335
    
336
    C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights), 
337
                       nrow(x), standstat, 
338
                       splitctrl, linexpcov2sample, expcovinf, 
339
                       REAL(cutpoint), INTEGER(levelset), REAL(maxstat), 
340
                       REAL(statistics));
341
342
    UNPROTECT(1);
343
    Free(standstat);
344
    return(ans);
345
}