--- a +++ b/partyMod/src/Splits.c @@ -0,0 +1,345 @@ + +/** + Binary splits + *\file Splits.c + *\author $Author$ + *\date $Date$ +*/ + +#include "party.h" + + +/** + Search for a cutpoint in a ordered variable x maximizing a two-sample + statistic w.r.t. (the influence function of ) the response variable y. + *\param x raw numeric measurements + *\param p dimension of the transformation + *\param y values of the influence function + *\param q dimension of the influence function + *\param weights case weights + *\param n number of observations + *\param orderx the ordering of the transformations, i.e. R> order(x) + *\param splitctrl an object of class `SplitControl' + *\param linexpcov2sample an (uninitialized) object of class + `LinStatExpectCovar' with p = 1 + *\param expcovinf an initialized object of class `ExpectCovarInfluence' + *\param cutpoint return value; pointer to a double for the cutpoint in x + *\param maxstat return value; pointer to a double for the + maximal test statistic + *\param statistics return value; pointer to a n-dim double + for the statistics +*/ + +void C_split(const double *x, int p, + const double *y, int q, + const double *weights, int n, + const int *orderx, + SEXP splitctrl, SEXP linexpcov2sample, + SEXP expcovinf, double *cutpoint, double *maxstat, + double *statistics) { + + double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar, + tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp; + double minobs, maxobs, xmax; + int lastj, i, j, k; + + if (p != 1) error("C_split: p not equal to one"); + tol = get_tol(splitctrl); + + /* init statistics and determine the maximal value with positive weight + since we can't choose this one as cutpoint + */ + xmax = 0.0; + for (i = 0; i < n; i++) { + statistics[i] = 0.0; + if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i]; + } + + /* we already have expecation and covariance of the response + * values and the sum of the weights */ + dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym)); + dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym)); + sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0]; + + /* if there is something to split */ + if (sweights > 1) { + + /* we need to ensure that at least minbucket weights + are there to split (either left or right) */ + minprob = get_minprob(splitctrl); + minbucket = get_minbucket(splitctrl); + minobs = sweights * minprob + 1.0; + + if (minobs < minbucket) + minobs = minbucket; + maxobs = sweights * (1 - minprob) - 1.0; + if (maxobs > sweights - minbucket) + maxobs = sweights - minbucket; + + f1 = (double) sweights / (sweights - 1); + f2 = 1.0 / (sweights - 1); + w = 0.0; + + /* pointers to the R-objects */ + dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym)); + for (k = 0; k < q; k++) dlinstat[k] = 0.0; + dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym)); + dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym)); + + tx = 0.0; + lastj = 0; + + /* for all possible cutpoints (defined by the observations x) */ + for (i = 0; i < (n - 1); i++) { + + /* the ordering of the ith observation */ + j = orderx[i] - 1; + + /* if the corresponding weight is zero */ + if (weights[j] == 0.0) continue; + + /* just a check: can be removed later */ + if (w > 0 && x[j] < tx) + warning("C_split: inconsistent ordering: %f < %f!\n", + x[j], tx); + + /* handle ties: delete the entry of the last visited observation + (take care of zero weights!) */ + if (w > 0 && x[j] == tx) + statistics[lastj] = 0.0; + + /* store the value and position of the j smallest observation */ + tx = x[j]; + lastj = j; + + w += weights[j]; + + /* do not consider those splits */ + if (w > maxobs || x[j] >= xmax) break; + + /* compute the linear statistic and expectation and + * covariance if needed */ + for (k = 0; k < q; k++) + dlinstat[k] += y[n * k + j] * weights[j]; + + if (w >= minobs) { + for (k = 0; k < q; k++) + dexpect[k] = w * dExp_y[k]; + + f1w = f1 * w; + f2ww = f2 * w * w; + for (k = 0; k < q*q; k++) + dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k]; + } else { + continue; + } + + /* the absolute standardized test statistic, to be maximized */ + /* statistics[j] = C_maxabsTestStatistic(dlinstat, + dexpect, dcovar, q, tol); */ + + /* much faster but uses maxabs always*/ + statistics[j] = 0.0; + for (k = 0; k < q; k++) { + if (dcovar[k * q + k] <= tol) continue; + tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]); + if (statistics[j] < tmp) statistics[j] = tmp; + } + + } + + /* search for the maximum and the best separating cutpoint */ + /* <FIXME> the result might differ between 32 and 64bit systems + because of rounding errors in 'statistics' */ + maxstat[0] = 0.0; + for (i = 0; i < n; i++) { + if (statistics[i] > maxstat[0]) { + maxstat[0] = statistics[i]; + cutpoint[0] = x[i]; + } + } + /* </FIXME> */ + } +} + + +/** + R-interface to C_split (does not handle ordered y's) + *\param x values of the transformation + *\param y values of the influence function + *\param weights case weights + *\param orderx the ordering of the transformations, i.e. R> order(x) + *\param linexpcov2sample an (uninitialized) object of class + `LinStatExpectCovar' with p = 1 + *\param expcovinf an initialized object of class `ExpectCovarInfluence' + *\param splitctrl an object of class `SplitControl' +*/ + +SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample, + SEXP expcovinf, SEXP splitctrl) { + + SEXP ans, cutpoint, maxstat, statistics; + + PROTECT(ans = allocVector(VECSXP, 3)); + SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1)); + SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1)); + SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x))); + + C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x), + INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf, + REAL(cutpoint), REAL(maxstat), REAL(statistics)); + UNPROTECT(1); + return(ans); +} + + +/** + Search for a cutpoint in a unordered factor x maximizing a two-sample + statistic w.r.t. (the influence function of ) the response variable y. + *\param codingx the coding of x, i.e. as.numeric(x) + *\param p dimension of the transformation + *\param y values of the influence function + *\param q dimension of the influence function + *\param weights case weights + *\param n number of observations + *\param codingx the coding of x, i.e. as.numeric(x) + *\param standstat the vector of the standardized statistics for x, y, + weights + *\param splitctrl an object of class `SplitControl' + *\param linexpcov2sample an (uninitialized) object of class + `LinStatExpectCovar' with p = 1 + *\param expcovinf an initialized object of class `ExpectCovarInfluence' + *\param cutpoint return value; pointer to a double for the cutpoint in x + *\param levelset return value; pointer to a p-dim 0/1 integer + *\param maxstat return value; pointer to a double for the + maximal test statistic + *\param statistics return value; pointer to a n-dim double for + the statistics +*/ + +void C_splitcategorical(const int *codingx, int p, + const double *y, int q, + const double *weights, int n, + double *standstat, + SEXP splitctrl, SEXP linexpcov2sample, + SEXP expcovinf, double *cutpoint, int *levelset, + double *maxstat, double *statistics) { + + double *tmpx, *tmptmpx, tmp = 0.0; + int *irank, *ordertmpx, i, j, k, l, jp, chk; + + /* allocate memory */ + tmpx = Calloc(n, double); + ordertmpx = Calloc(n, int); + irank = Calloc(p, int); + tmptmpx = Calloc(n, double); + + /* for all response variables (aka: dummy variables) */ + for (j = 0; j < q; j++) { + + jp = j * p; + + /* determine the ranking of the kth level among + the standardized statistic: This induced an ordering of the + observations */ + for (k = 0; k < p; k++) { + irank[k] = 1; + for (l = 0; l < p; l++) + if (standstat[jp + l] < standstat[jp + k]) irank[k]++; + } + + /* a temporary response variable: the rank of the level */ + for (i = 0; i < n; i++) { + /* <FIXME> do we have to adjust weights for missing values here??? */ + if (weights[i] == 0.0) { + tmpx[i] = 0.0; + } else { + tmpx[i] = (double) irank[codingx[i] - 1]; + } + /* </FIXME> */ + tmptmpx[i] = tmpx[i]; + ordertmpx[i] = i + 1; + } + + /* order(dtmpx) */ + rsort_with_index(tmptmpx, ordertmpx, n); + + /* search for a cutpoint (now we do have an ordering) */ + C_split(tmpx, 1, y, q, weights, n, ordertmpx, + splitctrl, linexpcov2sample, + expcovinf, cutpoint, maxstat, statistics); + + /* if we have seen an improvement: save this segmentation + note: there may be splits with equal goodness */ + chk = 0; + if (maxstat[0] > tmp) { + for (k = 0; k < p; k++) { + if (irank[k] > cutpoint[0]) { + levelset[k] = 1; + chk += 1; + } else { + levelset[k] = 0; + } + } + tmp = maxstat[0]; + } + /* <FIXME> make sure that at least one level goes left, + C_split may end up with cutpoint > max(irank), why? + </FIXME> + */ + /* hm, why did I added + if (chk == 0) tmp = 0.0; + ??? */ + } + maxstat[0] = tmp; + + /* free memory */ + Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx); +} + + +/** + R-interface to C_splitcategorical (does not handle ordered y's) + *\param x the values of the x-transformation + *\param codingx the coding of x, i.e. as.numeric(x) + *\param y values of the influence function + *\param weights case weights + *\param linexpcov2sample an (uninitialized) object of class + `LinStatExpectCovar' with p = 1 + *\param linexpcov an initialized object of class `LinStatExpectCovar' + *\param expcovinf an initialized object of class `ExpectCovarInfluence' + *\param splitctrl an object of class `SplitControl' +*/ + +SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights, + SEXP linexpcov2sample, SEXP linexpcov, + SEXP expcovinf, SEXP splitctrl) { + + SEXP ans, cutpoint, maxstat, statistics, levelset; + double *standstat; + + C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x), + 1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov); + + standstat = Calloc(get_dimension(linexpcov), double); + C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)), + REAL(GET_SLOT(linexpcov, PL2_expectationSym)), + REAL(GET_SLOT(linexpcov, PL2_covarianceSym)), + get_dimension(linexpcov), get_tol(splitctrl), standstat); + + PROTECT(ans = allocVector(VECSXP, 4)); + SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1)); + SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1)); + SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x))); + SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x))); + + C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights), + nrow(x), standstat, + splitctrl, linexpcov2sample, expcovinf, + REAL(cutpoint), INTEGER(levelset), REAL(maxstat), + REAL(statistics)); + + UNPROTECT(1); + Free(standstat); + return(ans); +}