Switch to unified view

a b/partyMod/src/IndependenceTest.c
1
2
/**
3
    Functions for variable selection in each node of a tree
4
    *\file IndependenceTest.c
5
    *\author $Author$
6
    *\date $Date$
7
*/
8
                
9
#include "party.h"
10
11
12
/**
13
    Computes the test statistic and, if requested, the corresponding 
14
    P-value for a linear statistic \n
15
    *\param linexpcov an object of class `LinStatExpectCovar'
16
    *\param varctrl an object of class `VariableControl'
17
    *\param ans_teststat; return value, the test statistic
18
    *\param ans_pvalue; return value, the p-value
19
*/
20
        
21
void C_TeststatPvalue(const SEXP linexpcov, const SEXP varctrl, 
22
                      double *ans_teststat, double *ans_pvalue) {
23
    
24
    double releps, abseps, tol;
25
    int maxpts;
26
    
27
    maxpts = get_maxpts(varctrl);
28
    tol = get_tol(varctrl);
29
    abseps = get_abseps(varctrl);
30
    releps = get_releps(varctrl);
31
    
32
    /* compute the test statistic */
33
    ans_teststat[0] = C_TestStatistic(linexpcov, get_teststat(varctrl), 
34
                                  get_tol(varctrl));
35
36
    /* compute the p-value if requested */                                  
37
    if (get_pvalue(varctrl))
38
        ans_pvalue[0] =  C_ConditionalPvalue(ans_teststat[0], linexpcov, 
39
                                         get_teststat(varctrl),
40
                                         tol, &maxpts, &releps, &abseps);
41
    else
42
        ans_pvalue[0] = 1.0;
43
}
44
45
/**
46
    Computes the test statistic and the node criterion \n
47
    *\param linexpcov an object of class `LinStatExpectCovar'
48
    *\param varctrl an object of class `VariableControl'
49
    *\param ans_teststat; return value, the test statistic
50
    *\param ans_criterion; return value, thep-value
51
*/
52
        
53
void C_TeststatCriterion(const SEXP linexpcov, const SEXP varctrl, 
54
                         double *ans_teststat, double *ans_criterion) {
55
    
56
    C_TeststatPvalue(linexpcov, varctrl, ans_teststat, ans_criterion);
57
    
58
    /* the node criterion is to be MAXIMISED, 
59
       i.e. 1-pvalue or test statistic \in \[0, \infty\] */
60
    if (get_pvalue(varctrl))
61
        ans_criterion[0] = 1 - ans_criterion[0];
62
    else
63
        ans_criterion[0] = ans_teststat[0];
64
    
65
}
66
67
68
/**
69
    Test of independence between x and y \n
70
    *\param x values of the transformation
71
    *\param y values of the influence function
72
    *\param weights case weights
73
    *\param linexpcov an object of class `VariableControl' for T
74
    *\param varctrl an object of class `VariableControl'
75
    *\param ans; return value, a double vector (teststat, pvalue)
76
*/
77
78
void C_IndependenceTest(const SEXP x, const SEXP y, const SEXP weights, 
79
                        SEXP linexpcov, SEXP varctrl, 
80
                        SEXP ans) {
81
    
82
    /* compute linear statistic and its conditional expectation and
83
       covariance
84
    */
85
    C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), 
86
                    REAL(weights), nrow(x), 1, 
87
                    GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
88
89
    /* compute test statistic */
90
    if (get_teststat(varctrl) == 2) 
91
        C_LinStatExpCovMPinv(linexpcov, get_tol(varctrl));
92
    C_TeststatPvalue(linexpcov, varctrl, &REAL(ans)[0], &REAL(ans)[1]);
93
}
94
95
96
/**
97
    R-interface to C_IndependenceTest \n
98
    *\param x values of the transformation
99
    *\param y values of the influence function
100
    *\param weights case weights
101
    *\param linexpcov an object of class `VariableControl' for T
102
    *\param varctrl an object of class `VariableControl'
103
*/
104
105
SEXP R_IndependenceTest(SEXP x, SEXP y, SEXP weights, SEXP linexpcov, SEXP varctrl) {
106
                        
107
    SEXP ans;
108
    
109
    PROTECT(ans = allocVector(REALSXP, 2));
110
    C_IndependenceTest(x, y, weights, linexpcov, varctrl, ans);
111
    UNPROTECT(1);
112
    return(ans);
113
}
114
115
116
/**
117
    Perform a global test on independence of a response and multiple inputs \n
118
    *\param learnsample an object of class `LearningSample'
119
    *\param weights case weights
120
    *\param fitmem an object of class `TreeFitMemory'
121
    *\param varctrl an object of class `VariableControl'
122
    *\param gtctrl an object of class `GlobalTestControl'
123
    *\param minsplit minimum sum of weights to proceed
124
    *\param ans_teststat return value; vector of test statistics
125
    *\param ans_criterion return value; vector of node criteria 
126
            (adjusted) pvalues or raw test statistics
127
    *\param depth an integer giving the depth of the current node
128
*/
129
130
void C_GlobalTest(const SEXP learnsample, const SEXP weights, 
131
                  SEXP fitmem, const SEXP varctrl, 
132
                  const SEXP gtctrl, const double minsplit, 
133
                  double *ans_teststat, double *ans_criterion, int depth) {
134
135
    int ninputs, nobs, j, i, k, RECALC = 1, type;
136
    SEXP responses, inputs, y, x, xmem, expcovinf;
137
    SEXP Smtry;
138
    double *thisweights, *pvaltmp, stweights = 0.0;
139
    int RANDOM, mtry, *randomvar, *index;
140
    int *dontuse, *dontusetmp;
141
    
142
    ninputs = get_ninputs(learnsample);
143
    nobs = get_nobs(learnsample);
144
    responses = GET_SLOT(learnsample, PL2_responsesSym);
145
    inputs = GET_SLOT(learnsample, PL2_inputsSym);
146
    
147
    /* y = get_transformation(responses, 1); */
148
    y = get_test_trafo(responses);
149
    
150
    expcovinf = GET_SLOT(fitmem, PL2_expcovinfSym);
151
    C_ExpectCovarInfluence(REAL(y), ncol(y), REAL(weights), 
152
                           nobs, expcovinf);
153
    
154
    if (REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0] < minsplit) {
155
        for (j = 0; j < ninputs; j++) {
156
            ans_teststat[j] = 0.0;
157
            ans_criterion[j] = 0.0;
158
        }
159
    } else {
160
161
        dontuse = INTEGER(get_dontuse(fitmem));
162
        dontusetmp = INTEGER(get_dontusetmp(fitmem));
163
    
164
        for (j = 0; j < ninputs; j++) dontusetmp[j] = !dontuse[j];
165
    
166
        /* random forest */
167
        RANDOM = get_randomsplits(gtctrl);
168
        Smtry = get_mtry(gtctrl);
169
        if (LENGTH(Smtry) == 1) {
170
            mtry = INTEGER(Smtry)[0];
171
        } else {
172
            /* mtry may vary with tree depth */
173
            depth = (depth <= LENGTH(Smtry)) ? depth : LENGTH(Smtry);
174
            mtry = INTEGER(get_mtry(gtctrl))[depth - 1];
175
        }
176
        if (RANDOM & (mtry > ninputs)) {
177
            warning("mtry is larger than ninputs, using mtry = inputs");
178
            mtry = ninputs;
179
            RANDOM = 0;
180
        }
181
        if (RANDOM) {
182
            index = Calloc(ninputs, int);
183
            randomvar = Calloc(mtry, int);
184
            C_SampleNoReplace(index, ninputs, mtry, randomvar);
185
            j = 0;
186
            for (k = 0; k < mtry; k++) {
187
                j = randomvar[k];
188
                while(dontuse[j] && j < ninputs) j++;
189
                if (j == ninputs) 
190
                    error("not enough variables to sample from");
191
                dontusetmp[j] = 0;
192
            }
193
            Free(index);
194
            Free(randomvar);
195
        }
196
197
        for (j = 1; j <= ninputs; j++) {
198
199
            if ((RANDOM && dontusetmp[j - 1]) || dontuse[j - 1]) {
200
                ans_teststat[j - 1] = 0.0;
201
                ans_criterion[j - 1] = 0.0;
202
                continue; 
203
            }
204
        
205
            x = get_transformation(inputs, j);
206
207
            xmem = get_varmemory(fitmem, j);
208
            if (!has_missings(inputs, j)) {
209
                C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
210
                                REAL(weights), nrow(x), !RECALC, expcovinf,
211
                                xmem);
212
            } else {
213
                thisweights = C_tempweights(j, weights, fitmem, inputs);
214
215
                /* check if minsplit criterion is still met 
216
                   in the presence of missing values
217
                   bug spotted by Han Lee <Han.Lee@geodecapital.com>
218
                       fixed 2006-08-31
219
                */
220
                stweights = 0.0;
221
                for (i = 0; i < nobs; i++) stweights += thisweights[i];
222
                if (stweights < minsplit) {
223
                    ans_teststat[j - 1] = 0.0;
224
                    ans_criterion[j - 1] = 0.0;
225
                    continue; 
226
                }
227
228
                C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
229
                                thisweights, nrow(x), RECALC, 
230
                                GET_SLOT(xmem, PL2_expcovinfSym),
231
                                xmem);
232
            }
233
234
            /* teststat = "quad" 
235
               ATTENTION: we mess with xmem by putting elements with zero variances last
236
                          but C_Node() reuses the original xmem for setting up 
237
                          categorical splits */
238
            if (get_teststat(varctrl) == 2)
239
                C_LinStatExpCovMPinv(xmem, get_tol(varctrl));
240
                
241
            C_TeststatCriterion(xmem, varctrl, &ans_teststat[j - 1], 
242
                                &ans_criterion[j - 1]);
243
                            
244
            /* teststat = "quad"
245
               make sure that the test statistics etc match the original order of levels 
246
               <FIXME> can we avoid to compute these things twice??? */ 
247
            if (get_teststat(varctrl) == 2) {
248
                if (!has_missings(inputs, j)) {
249
                    C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
250
                                    REAL(weights), nrow(x), !RECALC, expcovinf,
251
                                    xmem);
252
                } else {
253
                    C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
254
                                    thisweights, nrow(x), RECALC, 
255
                                    GET_SLOT(xmem, PL2_expcovinfSym),
256
                                    xmem);
257
                }
258
            }
259
            /* </FIXME> */
260
        }                
261
262
        type = get_testtype(gtctrl);
263
        switch(type) {
264
            /* Bonferroni: p_adj = 1 - (1 - p)^k */
265
            case BONFERRONI: 
266
                    for (j = 0; j < ninputs; j++)
267
                        ans_criterion[j] = R_pow_di(ans_criterion[j], ninputs);
268
                    break;
269
            /* Monte-Carlo */
270
            case MONTECARLO: 
271
                    pvaltmp = Calloc(ninputs, double);
272
                    C_MonteCarlo(ans_criterion, learnsample, weights, fitmem, 
273
                                 varctrl, gtctrl, pvaltmp);
274
                    for (j = 0; j < ninputs; j++)
275
                        ans_criterion[j] = 1 - pvaltmp[j];
276
                    Free(pvaltmp);
277
                    break;
278
            /* aggregated */
279
            case AGGREGATED: 
280
                    error("C_GlobalTest: aggregated global test not yet implemented");
281
                    break;
282
            /* raw */
283
            case UNIVARIATE: break;
284
            case TESTSTATISTIC: break;
285
            default: error("C_GlobalTest: undefined value for type argument");
286
                     break;
287
        }
288
    }
289
}
290
291
292
/**
293
    R-interface to C_GlobalTest \n
294
    *\param learnsample an object of class `LearningSample'
295
    *\param weights case weights
296
    *\param fitmem an object of class `TreeFitMemory'
297
    *\param varctrl an object of class `VariableControl'
298
    *\param gtctrl an object of class `GlobalTestControl'
299
*/
300
301
SEXP R_GlobalTest(SEXP learnsample, SEXP weights, SEXP fitmem, 
302
                  SEXP varctrl, SEXP gtctrl) {
303
304
    SEXP ans, teststat, criterion;
305
306
    GetRNGstate();
307
308
    PROTECT(ans = allocVector(VECSXP, 2));
309
    SET_VECTOR_ELT(ans, 0, 
310
        teststat = allocVector(REALSXP, get_ninputs(learnsample)));
311
    SET_VECTOR_ELT(ans, 1, 
312
        criterion = allocVector(REALSXP, get_ninputs(learnsample)));
313
314
    C_GlobalTest(learnsample, weights, fitmem, varctrl, gtctrl, 0, 
315
                 REAL(teststat), REAL(criterion), 1);
316
                 
317
    PutRNGstate();
318
    
319
    UNPROTECT(1);
320
    return(ans);
321
}