[fbf06f]: / partyMod / src / IndependenceTest.c

Download this file

322 lines (267 with data), 11.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
/**
Functions for variable selection in each node of a tree
*\file IndependenceTest.c
*\author $Author$
*\date $Date$
*/
#include "party.h"
/**
    Evaluates the test statistic of a linear statistic and, when the
    variable control requests it, the corresponding conditional P-value \n
    *\param linexpcov an object of class `LinStatExpectCovar'
    *\param varctrl an object of class `VariableControl'
    *\param ans_teststat return value; the test statistic is written to element 0
    *\param ans_pvalue return value; the p-value is written to element 0
     (1.0 is stored when no p-value is requested)
*/

void C_TeststatPvalue(const SEXP linexpcov, const SEXP varctrl,
                      double *ans_teststat, double *ans_pvalue) {

    /* settings for the p-value routine; maxpts, releps and abseps are
       handed over by address, so local copies are needed */
    int maxpts = get_maxpts(varctrl);
    double tol = get_tol(varctrl);
    double abseps = get_abseps(varctrl);
    double releps = get_releps(varctrl);

    /* the test statistic is always computed */
    *ans_teststat = C_TestStatistic(linexpcov, get_teststat(varctrl),
                                    get_tol(varctrl));

    /* without a p-value request, report the neutral value 1.0 */
    if (!get_pvalue(varctrl)) {
        *ans_pvalue = 1.0;
        return;
    }

    *ans_pvalue = C_ConditionalPvalue(*ans_teststat, linexpcov,
                                      get_teststat(varctrl),
                                      tol, &maxpts, &releps, &abseps);
}
/**
    Computes the test statistic together with the node criterion \n
    *\param linexpcov an object of class `LinStatExpectCovar'
    *\param varctrl an object of class `VariableControl'
    *\param ans_teststat return value; the test statistic
    *\param ans_criterion return value; the node criterion, i.e.
     1 - p-value if p-values are requested, the test statistic otherwise
*/

void C_TeststatCriterion(const SEXP linexpcov, const SEXP varctrl,
                         double *ans_teststat, double *ans_criterion) {

    /* ans_criterion temporarily receives the p-value */
    C_TeststatPvalue(linexpcov, varctrl, ans_teststat, ans_criterion);

    /* the node criterion is to be MAXIMISED, hence the p-value is
       turned into 1 - p-value; without p-values the raw test
       statistic serves as criterion */
    *ans_criterion = get_pvalue(varctrl) ? 1 - *ans_criterion
                                         : *ans_teststat;
}
/**
    Test of independence between x and y \n
    *\param x values of the transformation
    *\param y values of the influence function
    *\param weights case weights
    *\param linexpcov an object of class `LinStatExpectCovar' for T;
     it is filled in here and then passed on to C_TeststatPvalue
    *\param varctrl an object of class `VariableControl'
    *\param ans return value; a double vector (teststat, pvalue)
*/

void C_IndependenceTest(const SEXP x, const SEXP y, const SEXP weights,
                        SEXP linexpcov, SEXP varctrl,
                        SEXP ans) {

    double *result;

    /* linear statistic plus its conditional expectation and covariance;
       the results are stored in linexpcov */
    C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
                    REAL(weights), nrow(x), 1,
                    GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);

    /* test statistic type 2 needs the additional MPinv step
       (presumably the Moore-Penrose inverse of the covariance -- confirm) */
    if (get_teststat(varctrl) == 2)
        C_LinStatExpCovMPinv(linexpcov, get_tol(varctrl));

    /* element 0 receives the test statistic, element 1 the p-value */
    result = REAL(ans);
    C_TeststatPvalue(linexpcov, varctrl, result, result + 1);
}
/**
    R-interface to C_IndependenceTest \n
    *\param x values of the transformation
    *\param y values of the influence function
    *\param weights case weights
    *\param linexpcov an object of class `LinStatExpectCovar' for T
    *\param varctrl an object of class `VariableControl'
    *\return a freshly allocated double vector of length 2 (teststat, pvalue)
*/

SEXP R_IndependenceTest(SEXP x, SEXP y, SEXP weights, SEXP linexpcov, SEXP varctrl) {

    /* keep the result vector protected while the test is computed */
    SEXP result = PROTECT(allocVector(REALSXP, 2));

    C_IndependenceTest(x, y, weights, linexpcov, varctrl, result);

    UNPROTECT(1);
    return result;
}
/**
    Perform a global test on independence of a response and multiple inputs \n
    *\param learnsample an object of class `LearningSample'
    *\param weights case weights
    *\param fitmem an object of class `TreeFitMemory'
    *\param varctrl an object of class `VariableControl'
    *\param gtctrl an object of class `GlobalTestControl'
    *\param minsplit minimum sum of weights to proceed; if the sum of weights
     is smaller, all test statistics and criteria are set to zero
    *\param ans_teststat return value; vector of test statistics
     (one element per input variable)
    *\param ans_criterion return value; vector of node criteria
     (adjusted) pvalues or raw test statistics (one element per input variable)
    *\param depth an integer giving the depth of the current node; only used
     to select the element of mtry when mtry has more than one element
     (depths larger than the length of mtry use its last element)
*/

void C_GlobalTest(const SEXP learnsample, const SEXP weights,
                  SEXP fitmem, const SEXP varctrl,
                  const SEXP gtctrl, const double minsplit,
                  double *ans_teststat, double *ans_criterion, int depth) {

    int ninputs, nobs, j, i, k, RECALC = 1, type;
    SEXP responses, inputs, y, x, xmem, expcovinf;
    SEXP Smtry;
    double *thisweights = NULL, *pvaltmp, stweights = 0.0;
    int RANDOM, mtry, *randomvar, *index;
    int *dontuse, *dontusetmp;

    ninputs = get_ninputs(learnsample);
    nobs = get_nobs(learnsample);
    responses = GET_SLOT(learnsample, PL2_responsesSym);
    inputs = GET_SLOT(learnsample, PL2_inputsSym);

    /* y = get_transformation(responses, 1); */
    y = get_test_trafo(responses);

    /* expectation and covariance of the influence function; this also
       provides the sum of weights checked against minsplit below */
    expcovinf = GET_SLOT(fitmem, PL2_expcovinfSym);
    C_ExpectCovarInfluence(REAL(y), ncol(y), REAL(weights),
                           nobs, expcovinf);

    /* not enough weight in this node: nothing to test */
    if (REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0] < minsplit) {
        for (j = 0; j < ninputs; j++) {
            ans_teststat[j] = 0.0;
            ans_criterion[j] = 0.0;
        }
        return;
    }

    dontuse = INTEGER(get_dontuse(fitmem));
    dontusetmp = INTEGER(get_dontusetmp(fitmem));

    /* start with every usable variable flagged; the random selection
       below clears the flag for the variables that were drawn */
    for (j = 0; j < ninputs; j++) dontusetmp[j] = !dontuse[j];

    /* random forest */
    RANDOM = get_randomsplits(gtctrl);
    Smtry = get_mtry(gtctrl);
    if (LENGTH(Smtry) == 1) {
        mtry = INTEGER(Smtry)[0];
    } else {
        /* mtry may vary with tree depth */
        depth = (depth <= LENGTH(Smtry)) ? depth : LENGTH(Smtry);
        mtry = INTEGER(Smtry)[depth - 1];
    }

    /* logical (not bitwise) AND: RANDOM is used as a truth value */
    if (RANDOM && (mtry > ninputs)) {
        warning("mtry is larger than ninputs, using mtry = inputs");
        mtry = ninputs;
        /* all variables are used, no sampling necessary */
        RANDOM = 0;
    }

    if (RANDOM) {
        index = Calloc(ninputs, int);
        randomvar = Calloc(mtry, int);
        C_SampleNoReplace(index, ninputs, mtry, randomvar);
        for (k = 0; k < mtry; k++) {
            j = randomvar[k];
            /* skip excluded variables; the bound is tested first so that
               dontuse is never read past its last element */
            while (j < ninputs && dontuse[j]) j++;
            if (j >= ninputs) {
                /* error() does not return: release the buffers first */
                Free(index);
                Free(randomvar);
                error("not enough variables to sample from");
            }
            dontusetmp[j] = 0;
        }
        Free(index);
        Free(randomvar);
    }

    /* j is 1-based here because the accessors below take 1-based
       variable numbers; the result vectors are indexed with j - 1 */
    for (j = 1; j <= ninputs; j++) {

        /* variable was not drawn or is excluded altogether */
        if ((RANDOM && dontusetmp[j - 1]) || dontuse[j - 1]) {
            ans_teststat[j - 1] = 0.0;
            ans_criterion[j - 1] = 0.0;
            continue;
        }

        x = get_transformation(inputs, j);
        xmem = get_varmemory(fitmem, j);

        if (!has_missings(inputs, j)) {
            /* complete variable: reuse expcovinf computed above */
            C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
                            REAL(weights), nrow(x), !RECALC, expcovinf,
                            xmem);
        } else {
            thisweights = C_tempweights(j, weights, fitmem, inputs);

            /* check if minsplit criterion is still met
               in the presence of missing values
               bug spotted by Han Lee <Han.Lee@geodecapital.com>
               fixed 2006-08-31
            */
            stweights = 0.0;
            for (i = 0; i < nobs; i++) stweights += thisweights[i];
            if (stweights < minsplit) {
                ans_teststat[j - 1] = 0.0;
                ans_criterion[j - 1] = 0.0;
                continue;
            }

            /* variable-specific weights: expectation and covariance of
               the influence function are recomputed */
            C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
                            thisweights, nrow(x), RECALC,
                            GET_SLOT(xmem, PL2_expcovinfSym),
                            xmem);
        }

        /* teststat = "quad"
           ATTENTION: we mess with xmem by putting elements with zero variances last
           but C_Node() reuses the original xmem for setting up
           categorical splits */
        if (get_teststat(varctrl) == 2)
            C_LinStatExpCovMPinv(xmem, get_tol(varctrl));

        C_TeststatCriterion(xmem, varctrl, &ans_teststat[j - 1],
                            &ans_criterion[j - 1]);

        /* teststat = "quad"
           make sure that the test statistics etc match the original order of levels
           <FIXME> can we avoid to compute these things twice??? */
        if (get_teststat(varctrl) == 2) {
            if (!has_missings(inputs, j)) {
                C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
                                REAL(weights), nrow(x), !RECALC, expcovinf,
                                xmem);
            } else {
                /* thisweights was set in the missing-values branch above
                   during this very iteration */
                C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y),
                                thisweights, nrow(x), RECALC,
                                GET_SLOT(xmem, PL2_expcovinfSym),
                                xmem);
            }
        }
        /* </FIXME> */
    }

    type = get_testtype(gtctrl);
    switch(type) {
        /* Bonferroni: p_adj = 1 - (1 - p)^k
           the criterion holds 1 - p when p-values are computed, so
           raising it to the power k yields 1 - p_adj */
        case BONFERRONI:
            for (j = 0; j < ninputs; j++)
                ans_criterion[j] = R_pow_di(ans_criterion[j], ninputs);
            break;
        /* Monte-Carlo */
        case MONTECARLO:
            pvaltmp = Calloc(ninputs, double);
            C_MonteCarlo(ans_criterion, learnsample, weights, fitmem,
                         varctrl, gtctrl, pvaltmp);
            for (j = 0; j < ninputs; j++)
                ans_criterion[j] = 1 - pvaltmp[j];
            Free(pvaltmp);
            break;
        /* aggregated */
        case AGGREGATED:
            error("C_GlobalTest: aggregated global test not yet implemented");
            break;
        /* raw */
        case UNIVARIATE: break;
        case TESTSTATISTIC: break;
        default: error("C_GlobalTest: undefined value for type argument");
            break;
    }
}
/**
    R-interface to C_GlobalTest \n
    *\param learnsample an object of class `LearningSample'
    *\param weights case weights
    *\param fitmem an object of class `TreeFitMemory'
    *\param varctrl an object of class `VariableControl'
    *\param gtctrl an object of class `GlobalTestControl'
    *\return a list of length 2: the vector of test statistics and the
     vector of node criteria, each with one element per input variable
*/

SEXP R_GlobalTest(SEXP learnsample, SEXP weights, SEXP fitmem,
                  SEXP varctrl, SEXP gtctrl) {

    SEXP ans, teststat, criterion;

    /* read R's RNG state before C_GlobalTest runs; it is written back below */
    GetRNGstate();

    PROTECT(ans = allocVector(VECSXP, 2));

    /* each vector is attached to the protected list right after
       its allocation */
    teststat = allocVector(REALSXP, get_ninputs(learnsample));
    SET_VECTOR_ELT(ans, 0, teststat);
    criterion = allocVector(REALSXP, get_ninputs(learnsample));
    SET_VECTOR_ELT(ans, 1, criterion);

    /* minsplit = 0 and depth = 1 are fixed for the R-level entry point */
    C_GlobalTest(learnsample, weights, fitmem, varctrl, gtctrl, 0,
                 REAL(teststat), REAL(criterion), 1);

    PutRNGstate();

    UNPROTECT(1);
    return ans;
}