/**
Binary splits
*\file Splits.c
*\author $Author$
*\date $Date$
*/
#include "party.h"
/**
Search for a cutpoint in a ordered variable x maximizing a two-sample
statistic w.r.t. (the influence function of ) the response variable y.
*\param x raw numeric measurements
*\param p dimension of the transformation
*\param y values of the influence function
*\param q dimension of the influence function
*\param weights case weights
*\param n number of observations
*\param orderx the ordering of the transformations, i.e. R> order(x)
*\param splitctrl an object of class `SplitControl'
*\param linexpcov2sample an (uninitialized) object of class
`LinStatExpectCovar' with p = 1
*\param expcovinf an initialized object of class `ExpectCovarInfluence'
*\param cutpoint return value; pointer to a double for the cutpoint in x
*\param maxstat return value; pointer to a double for the
maximal test statistic
*\param statistics return value; pointer to a n-dim double
for the statistics
*/
void C_split(const double *x, int p,
const double *y, int q,
const double *weights, int n,
const int *orderx,
SEXP splitctrl, SEXP linexpcov2sample,
SEXP expcovinf, double *cutpoint, double *maxstat,
double *statistics) {
double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar,
tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
double minobs, maxobs, xmax;
int lastj, i, j, k;
if (p != 1) error("C_split: p not equal to one");
tol = get_tol(splitctrl);
/* init statistics and determine the maximal value with positive weight
since we can't choose this one as cutpoint
*/
xmax = 0.0;
for (i = 0; i < n; i++) {
statistics[i] = 0.0;
if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
}
/* we already have expecation and covariance of the response
* values and the sum of the weights */
dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
/* if there is something to split */
if (sweights > 1) {
/* we need to ensure that at least minbucket weights
are there to split (either left or right) */
minprob = get_minprob(splitctrl);
minbucket = get_minbucket(splitctrl);
minobs = sweights * minprob + 1.0;
if (minobs < minbucket)
minobs = minbucket;
maxobs = sweights * (1 - minprob) - 1.0;
if (maxobs > sweights - minbucket)
maxobs = sweights - minbucket;
f1 = (double) sweights / (sweights - 1);
f2 = 1.0 / (sweights - 1);
w = 0.0;
/* pointers to the R-objects */
dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
for (k = 0; k < q; k++) dlinstat[k] = 0.0;
dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
tx = 0.0;
lastj = 0;
/* for all possible cutpoints (defined by the observations x) */
for (i = 0; i < (n - 1); i++) {
/* the ordering of the ith observation */
j = orderx[i] - 1;
/* if the corresponding weight is zero */
if (weights[j] == 0.0) continue;
/* just a check: can be removed later */
if (w > 0 && x[j] < tx)
warning("C_split: inconsistent ordering: %f < %f!\n",
x[j], tx);
/* handle ties: delete the entry of the last visited observation
(take care of zero weights!) */
if (w > 0 && x[j] == tx)
statistics[lastj] = 0.0;
/* store the value and position of the j smallest observation */
tx = x[j];
lastj = j;
w += weights[j];
/* do not consider those splits */
if (w > maxobs || x[j] >= xmax) break;
/* compute the linear statistic and expectation and
* covariance if needed */
for (k = 0; k < q; k++)
dlinstat[k] += y[n * k + j] * weights[j];
if (w >= minobs) {
for (k = 0; k < q; k++)
dexpect[k] = w * dExp_y[k];
f1w = f1 * w;
f2ww = f2 * w * w;
for (k = 0; k < q*q; k++)
dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
} else {
continue;
}
/* the absolute standardized test statistic, to be maximized */
/* statistics[j] = C_maxabsTestStatistic(dlinstat,
dexpect, dcovar, q, tol); */
/* much faster but uses maxabs always*/
statistics[j] = 0.0;
for (k = 0; k < q; k++) {
if (dcovar[k * q + k] <= tol) continue;
tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
if (statistics[j] < tmp) statistics[j] = tmp;
}
}
/* search for the maximum and the best separating cutpoint */
/* <FIXME> the result might differ between 32 and 64bit systems
because of rounding errors in 'statistics' */
maxstat[0] = 0.0;
for (i = 0; i < n; i++) {
if (statistics[i] > maxstat[0]) {
maxstat[0] = statistics[i];
cutpoint[0] = x[i];
}
}
/* </FIXME> */
}
}
/**
R-interface to C_split (does not handle ordered y's)
*\param x values of the transformation
*\param y values of the influence function
*\param weights case weights
*\param orderx the ordering of the transformations, i.e. R> order(x)
*\param linexpcov2sample an (uninitialized) object of class
`LinStatExpectCovar' with p = 1
*\param expcovinf an initialized object of class `ExpectCovarInfluence'
*\param splitctrl an object of class `SplitControl'
*/
SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample,
SEXP expcovinf, SEXP splitctrl) {
SEXP ans, cutpoint, maxstat, statistics;
PROTECT(ans = allocVector(VECSXP, 3));
SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf,
REAL(cutpoint), REAL(maxstat), REAL(statistics));
UNPROTECT(1);
return(ans);
}
/**
Search for a cutpoint in a unordered factor x maximizing a two-sample
statistic w.r.t. (the influence function of ) the response variable y.
*\param codingx the coding of x, i.e. as.numeric(x)
*\param p dimension of the transformation
*\param y values of the influence function
*\param q dimension of the influence function
*\param weights case weights
*\param n number of observations
*\param codingx the coding of x, i.e. as.numeric(x)
*\param standstat the vector of the standardized statistics for x, y,
weights
*\param splitctrl an object of class `SplitControl'
*\param linexpcov2sample an (uninitialized) object of class
`LinStatExpectCovar' with p = 1
*\param expcovinf an initialized object of class `ExpectCovarInfluence'
*\param cutpoint return value; pointer to a double for the cutpoint in x
*\param levelset return value; pointer to a p-dim 0/1 integer
*\param maxstat return value; pointer to a double for the
maximal test statistic
*\param statistics return value; pointer to a n-dim double for
the statistics
*/
void C_splitcategorical(const int *codingx, int p,
const double *y, int q,
const double *weights, int n,
double *standstat,
SEXP splitctrl, SEXP linexpcov2sample,
SEXP expcovinf, double *cutpoint, int *levelset,
double *maxstat, double *statistics) {
double *tmpx, *tmptmpx, tmp = 0.0;
int *irank, *ordertmpx, i, j, k, l, jp, chk;
/* allocate memory */
tmpx = Calloc(n, double);
ordertmpx = Calloc(n, int);
irank = Calloc(p, int);
tmptmpx = Calloc(n, double);
/* for all response variables (aka: dummy variables) */
for (j = 0; j < q; j++) {
jp = j * p;
/* determine the ranking of the kth level among
the standardized statistic: This induced an ordering of the
observations */
for (k = 0; k < p; k++) {
irank[k] = 1;
for (l = 0; l < p; l++)
if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
}
/* a temporary response variable: the rank of the level */
for (i = 0; i < n; i++) {
/* <FIXME> do we have to adjust weights for missing values here??? */
if (weights[i] == 0.0) {
tmpx[i] = 0.0;
} else {
tmpx[i] = (double) irank[codingx[i] - 1];
}
/* </FIXME> */
tmptmpx[i] = tmpx[i];
ordertmpx[i] = i + 1;
}
/* order(dtmpx) */
rsort_with_index(tmptmpx, ordertmpx, n);
/* search for a cutpoint (now we do have an ordering) */
C_split(tmpx, 1, y, q, weights, n, ordertmpx,
splitctrl, linexpcov2sample,
expcovinf, cutpoint, maxstat, statistics);
/* if we have seen an improvement: save this segmentation
note: there may be splits with equal goodness */
chk = 0;
if (maxstat[0] > tmp) {
for (k = 0; k < p; k++) {
if (irank[k] > cutpoint[0]) {
levelset[k] = 1;
chk += 1;
} else {
levelset[k] = 0;
}
}
tmp = maxstat[0];
}
/* <FIXME> make sure that at least one level goes left,
C_split may end up with cutpoint > max(irank), why?
</FIXME>
*/
/* hm, why did I added
if (chk == 0) tmp = 0.0;
??? */
}
maxstat[0] = tmp;
/* free memory */
Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
}
/**
R-interface to C_splitcategorical (does not handle ordered y's)
*\param x the values of the x-transformation
*\param codingx the coding of x, i.e. as.numeric(x)
*\param y values of the influence function
*\param weights case weights
*\param linexpcov2sample an (uninitialized) object of class
`LinStatExpectCovar' with p = 1
*\param linexpcov an initialized object of class `LinStatExpectCovar'
*\param expcovinf an initialized object of class `ExpectCovarInfluence'
*\param splitctrl an object of class `SplitControl'
*/
SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights,
SEXP linexpcov2sample, SEXP linexpcov,
SEXP expcovinf, SEXP splitctrl) {
SEXP ans, cutpoint, maxstat, statistics, levelset;
double *standstat;
C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
standstat = Calloc(get_dimension(linexpcov), double);
C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
get_dimension(linexpcov), get_tol(splitctrl), standstat);
PROTECT(ans = allocVector(VECSXP, 4));
SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights),
nrow(x), standstat,
splitctrl, linexpcov2sample, expcovinf,
REAL(cutpoint), INTEGER(levelset), REAL(maxstat),
REAL(statistics));
UNPROTECT(1);
Free(standstat);
return(ans);
}