|
a |
|
b/partyMod/src/IndependenceTest.c |
|
|
1 |
|
|
|
2 |
/** |
|
|
3 |
Functions for variable selection in each node of a tree |
|
|
4 |
*\file IndependenceTest.c |
|
|
5 |
*\author $Author$ |
|
|
6 |
*\date $Date$ |
|
|
7 |
*/ |
|
|
8 |
|
|
|
9 |
#include "party.h" |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
/** |
|
|
13 |
Computes the test statistic and, if requested, the corresponding |
|
|
14 |
P-value for a linear statistic \n |
|
|
15 |
*\param linexpcov an object of class `LinStatExpectCovar' |
|
|
16 |
*\param varctrl an object of class `VariableControl' |
|
|
17 |
*\param ans_teststat; return value, the test statistic |
|
|
18 |
*\param ans_pvalue; return value, the p-value |
|
|
19 |
*/ |
|
|
20 |
|
|
|
21 |
void C_TeststatPvalue(const SEXP linexpcov, const SEXP varctrl, |
|
|
22 |
double *ans_teststat, double *ans_pvalue) { |
|
|
23 |
|
|
|
24 |
double releps, abseps, tol; |
|
|
25 |
int maxpts; |
|
|
26 |
|
|
|
27 |
maxpts = get_maxpts(varctrl); |
|
|
28 |
tol = get_tol(varctrl); |
|
|
29 |
abseps = get_abseps(varctrl); |
|
|
30 |
releps = get_releps(varctrl); |
|
|
31 |
|
|
|
32 |
/* compute the test statistic */ |
|
|
33 |
ans_teststat[0] = C_TestStatistic(linexpcov, get_teststat(varctrl), |
|
|
34 |
get_tol(varctrl)); |
|
|
35 |
|
|
|
36 |
/* compute the p-value if requested */ |
|
|
37 |
if (get_pvalue(varctrl)) |
|
|
38 |
ans_pvalue[0] = C_ConditionalPvalue(ans_teststat[0], linexpcov, |
|
|
39 |
get_teststat(varctrl), |
|
|
40 |
tol, &maxpts, &releps, &abseps); |
|
|
41 |
else |
|
|
42 |
ans_pvalue[0] = 1.0; |
|
|
43 |
} |
|
|
44 |
|
|
|
45 |
/** |
|
|
46 |
Computes the test statistic and the node criterion \n |
|
|
47 |
*\param linexpcov an object of class `LinStatExpectCovar' |
|
|
48 |
*\param varctrl an object of class `VariableControl' |
|
|
49 |
*\param ans_teststat; return value, the test statistic |
|
|
50 |
*\param ans_criterion; return value, thep-value |
|
|
51 |
*/ |
|
|
52 |
|
|
|
53 |
void C_TeststatCriterion(const SEXP linexpcov, const SEXP varctrl, |
|
|
54 |
double *ans_teststat, double *ans_criterion) { |
|
|
55 |
|
|
|
56 |
C_TeststatPvalue(linexpcov, varctrl, ans_teststat, ans_criterion); |
|
|
57 |
|
|
|
58 |
/* the node criterion is to be MAXIMISED, |
|
|
59 |
i.e. 1-pvalue or test statistic \in \[0, \infty\] */ |
|
|
60 |
if (get_pvalue(varctrl)) |
|
|
61 |
ans_criterion[0] = 1 - ans_criterion[0]; |
|
|
62 |
else |
|
|
63 |
ans_criterion[0] = ans_teststat[0]; |
|
|
64 |
|
|
|
65 |
} |
|
|
66 |
|
|
|
67 |
|
|
|
68 |
/** |
|
|
69 |
Test of independence between x and y \n |
|
|
70 |
*\param x values of the transformation |
|
|
71 |
*\param y values of the influence function |
|
|
72 |
*\param weights case weights |
|
|
73 |
*\param linexpcov an object of class `VariableControl' for T |
|
|
74 |
*\param varctrl an object of class `VariableControl' |
|
|
75 |
*\param ans; return value, a double vector (teststat, pvalue) |
|
|
76 |
*/ |
|
|
77 |
|
|
|
78 |
void C_IndependenceTest(const SEXP x, const SEXP y, const SEXP weights, |
|
|
79 |
SEXP linexpcov, SEXP varctrl, |
|
|
80 |
SEXP ans) { |
|
|
81 |
|
|
|
82 |
/* compute linear statistic and its conditional expectation and |
|
|
83 |
covariance |
|
|
84 |
*/ |
|
|
85 |
C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), |
|
|
86 |
REAL(weights), nrow(x), 1, |
|
|
87 |
GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov); |
|
|
88 |
|
|
|
89 |
/* compute test statistic */ |
|
|
90 |
if (get_teststat(varctrl) == 2) |
|
|
91 |
C_LinStatExpCovMPinv(linexpcov, get_tol(varctrl)); |
|
|
92 |
C_TeststatPvalue(linexpcov, varctrl, &REAL(ans)[0], &REAL(ans)[1]); |
|
|
93 |
} |
|
|
94 |
|
|
|
95 |
|
|
|
96 |
/** |
|
|
97 |
R-interface to C_IndependenceTest \n |
|
|
98 |
*\param x values of the transformation |
|
|
99 |
*\param y values of the influence function |
|
|
100 |
*\param weights case weights |
|
|
101 |
*\param linexpcov an object of class `VariableControl' for T |
|
|
102 |
*\param varctrl an object of class `VariableControl' |
|
|
103 |
*/ |
|
|
104 |
|
|
|
105 |
SEXP R_IndependenceTest(SEXP x, SEXP y, SEXP weights, SEXP linexpcov, SEXP varctrl) { |
|
|
106 |
|
|
|
107 |
SEXP ans; |
|
|
108 |
|
|
|
109 |
PROTECT(ans = allocVector(REALSXP, 2)); |
|
|
110 |
C_IndependenceTest(x, y, weights, linexpcov, varctrl, ans); |
|
|
111 |
UNPROTECT(1); |
|
|
112 |
return(ans); |
|
|
113 |
} |
|
|
114 |
|
|
|
115 |
|
|
|
116 |
/** |
|
|
117 |
Perform a global test on independence of a response and multiple inputs \n |
|
|
118 |
*\param learnsample an object of class `LearningSample' |
|
|
119 |
*\param weights case weights |
|
|
120 |
*\param fitmem an object of class `TreeFitMemory' |
|
|
121 |
*\param varctrl an object of class `VariableControl' |
|
|
122 |
*\param gtctrl an object of class `GlobalTestControl' |
|
|
123 |
*\param minsplit minimum sum of weights to proceed |
|
|
124 |
*\param ans_teststat return value; vector of test statistics |
|
|
125 |
*\param ans_criterion return value; vector of node criteria |
|
|
126 |
(adjusted) pvalues or raw test statistics |
|
|
127 |
*\param depth an integer giving the depth of the current node |
|
|
128 |
*/ |
|
|
129 |
|
|
|
130 |
void C_GlobalTest(const SEXP learnsample, const SEXP weights, |
|
|
131 |
SEXP fitmem, const SEXP varctrl, |
|
|
132 |
const SEXP gtctrl, const double minsplit, |
|
|
133 |
double *ans_teststat, double *ans_criterion, int depth) { |
|
|
134 |
|
|
|
135 |
int ninputs, nobs, j, i, k, RECALC = 1, type; |
|
|
136 |
SEXP responses, inputs, y, x, xmem, expcovinf; |
|
|
137 |
SEXP Smtry; |
|
|
138 |
double *thisweights, *pvaltmp, stweights = 0.0; |
|
|
139 |
int RANDOM, mtry, *randomvar, *index; |
|
|
140 |
int *dontuse, *dontusetmp; |
|
|
141 |
|
|
|
142 |
ninputs = get_ninputs(learnsample); |
|
|
143 |
nobs = get_nobs(learnsample); |
|
|
144 |
responses = GET_SLOT(learnsample, PL2_responsesSym); |
|
|
145 |
inputs = GET_SLOT(learnsample, PL2_inputsSym); |
|
|
146 |
|
|
|
147 |
/* y = get_transformation(responses, 1); */ |
|
|
148 |
y = get_test_trafo(responses); |
|
|
149 |
|
|
|
150 |
expcovinf = GET_SLOT(fitmem, PL2_expcovinfSym); |
|
|
151 |
C_ExpectCovarInfluence(REAL(y), ncol(y), REAL(weights), |
|
|
152 |
nobs, expcovinf); |
|
|
153 |
|
|
|
154 |
if (REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0] < minsplit) { |
|
|
155 |
for (j = 0; j < ninputs; j++) { |
|
|
156 |
ans_teststat[j] = 0.0; |
|
|
157 |
ans_criterion[j] = 0.0; |
|
|
158 |
} |
|
|
159 |
} else { |
|
|
160 |
|
|
|
161 |
dontuse = INTEGER(get_dontuse(fitmem)); |
|
|
162 |
dontusetmp = INTEGER(get_dontusetmp(fitmem)); |
|
|
163 |
|
|
|
164 |
for (j = 0; j < ninputs; j++) dontusetmp[j] = !dontuse[j]; |
|
|
165 |
|
|
|
166 |
/* random forest */ |
|
|
167 |
RANDOM = get_randomsplits(gtctrl); |
|
|
168 |
Smtry = get_mtry(gtctrl); |
|
|
169 |
if (LENGTH(Smtry) == 1) { |
|
|
170 |
mtry = INTEGER(Smtry)[0]; |
|
|
171 |
} else { |
|
|
172 |
/* mtry may vary with tree depth */ |
|
|
173 |
depth = (depth <= LENGTH(Smtry)) ? depth : LENGTH(Smtry); |
|
|
174 |
mtry = INTEGER(get_mtry(gtctrl))[depth - 1]; |
|
|
175 |
} |
|
|
176 |
if (RANDOM & (mtry > ninputs)) { |
|
|
177 |
warning("mtry is larger than ninputs, using mtry = inputs"); |
|
|
178 |
mtry = ninputs; |
|
|
179 |
RANDOM = 0; |
|
|
180 |
} |
|
|
181 |
if (RANDOM) { |
|
|
182 |
index = Calloc(ninputs, int); |
|
|
183 |
randomvar = Calloc(mtry, int); |
|
|
184 |
C_SampleNoReplace(index, ninputs, mtry, randomvar); |
|
|
185 |
j = 0; |
|
|
186 |
for (k = 0; k < mtry; k++) { |
|
|
187 |
j = randomvar[k]; |
|
|
188 |
while(dontuse[j] && j < ninputs) j++; |
|
|
189 |
if (j == ninputs) |
|
|
190 |
error("not enough variables to sample from"); |
|
|
191 |
dontusetmp[j] = 0; |
|
|
192 |
} |
|
|
193 |
Free(index); |
|
|
194 |
Free(randomvar); |
|
|
195 |
} |
|
|
196 |
|
|
|
197 |
for (j = 1; j <= ninputs; j++) { |
|
|
198 |
|
|
|
199 |
if ((RANDOM && dontusetmp[j - 1]) || dontuse[j - 1]) { |
|
|
200 |
ans_teststat[j - 1] = 0.0; |
|
|
201 |
ans_criterion[j - 1] = 0.0; |
|
|
202 |
continue; |
|
|
203 |
} |
|
|
204 |
|
|
|
205 |
x = get_transformation(inputs, j); |
|
|
206 |
|
|
|
207 |
xmem = get_varmemory(fitmem, j); |
|
|
208 |
if (!has_missings(inputs, j)) { |
|
|
209 |
C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), |
|
|
210 |
REAL(weights), nrow(x), !RECALC, expcovinf, |
|
|
211 |
xmem); |
|
|
212 |
} else { |
|
|
213 |
thisweights = C_tempweights(j, weights, fitmem, inputs); |
|
|
214 |
|
|
|
215 |
/* check if minsplit criterion is still met |
|
|
216 |
in the presence of missing values |
|
|
217 |
bug spotted by Han Lee <Han.Lee@geodecapital.com> |
|
|
218 |
fixed 2006-08-31 |
|
|
219 |
*/ |
|
|
220 |
stweights = 0.0; |
|
|
221 |
for (i = 0; i < nobs; i++) stweights += thisweights[i]; |
|
|
222 |
if (stweights < minsplit) { |
|
|
223 |
ans_teststat[j - 1] = 0.0; |
|
|
224 |
ans_criterion[j - 1] = 0.0; |
|
|
225 |
continue; |
|
|
226 |
} |
|
|
227 |
|
|
|
228 |
C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), |
|
|
229 |
thisweights, nrow(x), RECALC, |
|
|
230 |
GET_SLOT(xmem, PL2_expcovinfSym), |
|
|
231 |
xmem); |
|
|
232 |
} |
|
|
233 |
|
|
|
234 |
/* teststat = "quad" |
|
|
235 |
ATTENTION: we mess with xmem by putting elements with zero variances last |
|
|
236 |
but C_Node() reuses the original xmem for setting up |
|
|
237 |
categorical splits */ |
|
|
238 |
if (get_teststat(varctrl) == 2) |
|
|
239 |
C_LinStatExpCovMPinv(xmem, get_tol(varctrl)); |
|
|
240 |
|
|
|
241 |
C_TeststatCriterion(xmem, varctrl, &ans_teststat[j - 1], |
|
|
242 |
&ans_criterion[j - 1]); |
|
|
243 |
|
|
|
244 |
/* teststat = "quad" |
|
|
245 |
make sure that the test statistics etc match the original order of levels |
|
|
246 |
<FIXME> can we avoid to compute these things twice??? */ |
|
|
247 |
if (get_teststat(varctrl) == 2) { |
|
|
248 |
if (!has_missings(inputs, j)) { |
|
|
249 |
C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), |
|
|
250 |
REAL(weights), nrow(x), !RECALC, expcovinf, |
|
|
251 |
xmem); |
|
|
252 |
} else { |
|
|
253 |
C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), |
|
|
254 |
thisweights, nrow(x), RECALC, |
|
|
255 |
GET_SLOT(xmem, PL2_expcovinfSym), |
|
|
256 |
xmem); |
|
|
257 |
} |
|
|
258 |
} |
|
|
259 |
/* </FIXME> */ |
|
|
260 |
} |
|
|
261 |
|
|
|
262 |
type = get_testtype(gtctrl); |
|
|
263 |
switch(type) { |
|
|
264 |
/* Bonferroni: p_adj = 1 - (1 - p)^k */ |
|
|
265 |
case BONFERRONI: |
|
|
266 |
for (j = 0; j < ninputs; j++) |
|
|
267 |
ans_criterion[j] = R_pow_di(ans_criterion[j], ninputs); |
|
|
268 |
break; |
|
|
269 |
/* Monte-Carlo */ |
|
|
270 |
case MONTECARLO: |
|
|
271 |
pvaltmp = Calloc(ninputs, double); |
|
|
272 |
C_MonteCarlo(ans_criterion, learnsample, weights, fitmem, |
|
|
273 |
varctrl, gtctrl, pvaltmp); |
|
|
274 |
for (j = 0; j < ninputs; j++) |
|
|
275 |
ans_criterion[j] = 1 - pvaltmp[j]; |
|
|
276 |
Free(pvaltmp); |
|
|
277 |
break; |
|
|
278 |
/* aggregated */ |
|
|
279 |
case AGGREGATED: |
|
|
280 |
error("C_GlobalTest: aggregated global test not yet implemented"); |
|
|
281 |
break; |
|
|
282 |
/* raw */ |
|
|
283 |
case UNIVARIATE: break; |
|
|
284 |
case TESTSTATISTIC: break; |
|
|
285 |
default: error("C_GlobalTest: undefined value for type argument"); |
|
|
286 |
break; |
|
|
287 |
} |
|
|
288 |
} |
|
|
289 |
} |
|
|
290 |
|
|
|
291 |
|
|
|
292 |
/** |
|
|
293 |
R-interface to C_GlobalTest \n |
|
|
294 |
*\param learnsample an object of class `LearningSample' |
|
|
295 |
*\param weights case weights |
|
|
296 |
*\param fitmem an object of class `TreeFitMemory' |
|
|
297 |
*\param varctrl an object of class `VariableControl' |
|
|
298 |
*\param gtctrl an object of class `GlobalTestControl' |
|
|
299 |
*/ |
|
|
300 |
|
|
|
301 |
SEXP R_GlobalTest(SEXP learnsample, SEXP weights, SEXP fitmem, |
|
|
302 |
SEXP varctrl, SEXP gtctrl) { |
|
|
303 |
|
|
|
304 |
SEXP ans, teststat, criterion; |
|
|
305 |
|
|
|
306 |
GetRNGstate(); |
|
|
307 |
|
|
|
308 |
PROTECT(ans = allocVector(VECSXP, 2)); |
|
|
309 |
SET_VECTOR_ELT(ans, 0, |
|
|
310 |
teststat = allocVector(REALSXP, get_ninputs(learnsample))); |
|
|
311 |
SET_VECTOR_ELT(ans, 1, |
|
|
312 |
criterion = allocVector(REALSXP, get_ninputs(learnsample))); |
|
|
313 |
|
|
|
314 |
C_GlobalTest(learnsample, weights, fitmem, varctrl, gtctrl, 0, |
|
|
315 |
REAL(teststat), REAL(criterion), 1); |
|
|
316 |
|
|
|
317 |
PutRNGstate(); |
|
|
318 |
|
|
|
319 |
UNPROTECT(1); |
|
|
320 |
return(ans); |
|
|
321 |
} |