Diff of /partyMod/src/Splits.c [000000] .. [fbf06f]

Switch to side-by-side view

--- a
+++ b/partyMod/src/Splits.c
@@ -0,0 +1,345 @@
+
+/**
+    Binary splits 
+    *\file Splits.c
+    *\author $Author$
+    *\date $Date$
+*/
+                
+#include "party.h"
+
+
+/**
+    Search for a cutpoint in a ordered variable x maximizing a two-sample
+    statistic w.r.t. (the influence function of ) the response variable y.
+    *\param x raw numeric measurements 
+    *\param p dimension of the transformation
+    *\param y values of the influence function
+    *\param q dimension of the influence function
+    *\param weights case weights
+    *\param n number of observations
+    *\param orderx the ordering of the transformations, i.e. R> order(x)
+    *\param splitctrl an object of class `SplitControl'
+    *\param linexpcov2sample an (uninitialized) object of class 
+                             `LinStatExpectCovar' with p = 1
+    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
+    *\param cutpoint return value; pointer to a double for the cutpoint in x
+    *\param maxstat return value; pointer to a double for the 
+                    maximal test statistic
+    *\param statistics return value; pointer to a n-dim double 
+                       for the statistics
+*/
+                                
+void C_split(const double *x, int p,
+             const double *y, int q,
+             const double *weights, int n,
+             const int *orderx,
+             SEXP splitctrl, SEXP linexpcov2sample, 
+             SEXP expcovinf, double *cutpoint, double *maxstat, 
+             double *statistics) {
+
+    double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar, 
+           tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
+    double minobs, maxobs, xmax;
+    int lastj, i, j, k;
+
+    if (p != 1) error("C_split: p not equal to one");
+    tol = get_tol(splitctrl);
+
+    /* init statistics and determine the maximal value with positive weight 
+       since we can't choose this one as cutpoint
+    */
+    xmax = 0.0;
+    for (i = 0; i < n; i++) {
+        statistics[i] = 0.0;
+        if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
+    }
+
+    /* we already have expecation and covariance of the response
+     * values and the sum of the weights */
+    dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
+    dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
+    sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
+
+    /* if there is something to split */
+    if (sweights > 1) {
+
+        /* we need to ensure that at least minbucket weights 
+           are there to split (either left or right) */
+        minprob = get_minprob(splitctrl);
+        minbucket = get_minbucket(splitctrl);
+        minobs = sweights * minprob + 1.0;
+
+        if (minobs < minbucket) 
+            minobs = minbucket; 
+        maxobs = sweights * (1 - minprob) - 1.0;
+        if (maxobs > sweights - minbucket) 
+            maxobs = sweights - minbucket; 
+
+        f1 = (double) sweights / (sweights - 1);
+        f2 = 1.0 / (sweights - 1);
+        w = 0.0;
+    
+        /* pointers to the R-objects */
+        dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
+        for (k = 0; k < q; k++) dlinstat[k] = 0.0;
+        dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
+        dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
+
+        tx = 0.0;
+        lastj = 0;
+
+        /* for all possible cutpoints (defined by the observations x) */
+        for (i = 0; i < (n - 1); i++) {
+    
+            /* the ordering of the ith observation */
+            j = orderx[i] - 1;
+        
+            /* if the corresponding weight is zero */
+            if (weights[j] == 0.0) continue;
+
+            /* just a check: can be removed later */
+            if (w > 0 && x[j] < tx)
+                warning("C_split: inconsistent ordering: %f < %f!\n", 
+                        x[j], tx);
+        
+            /* handle ties: delete the entry of the last visited observation
+               (take care of zero weights!) */
+            if (w > 0 && x[j] == tx)
+                statistics[lastj] = 0.0; 
+
+            /* store the value and position of the j smallest observation */
+            tx = x[j];
+            lastj = j;
+        
+            w += weights[j];
+
+            /* do not consider those splits */
+            if (w > maxobs || x[j] >= xmax) break;
+
+            /* compute the linear statistic and expectation and 
+             * covariance if needed */
+            for (k = 0; k < q; k++)
+                dlinstat[k] += y[n * k + j] * weights[j];
+ 
+            if (w >= minobs) {
+                for (k = 0; k < q; k++)
+                    dexpect[k] = w * dExp_y[k];
+
+                f1w = f1 * w;
+                f2ww = f2 * w * w;
+                for (k = 0; k < q*q; k++)
+                    dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
+            } else {
+                continue;
+            }
+        
+            /* the absolute standardized test statistic, to be maximized */
+            /* statistics[j] = C_maxabsTestStatistic(dlinstat, 
+                   dexpect, dcovar, q, tol); */
+
+            /* much faster but uses maxabs always*/
+            statistics[j] = 0.0;
+            for (k = 0; k < q; k++) {
+                if (dcovar[k * q + k] <= tol) continue;
+                tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
+                if (statistics[j] < tmp) statistics[j] = tmp;
+            }
+
+        }
+    
+        /* search for the maximum and the best separating cutpoint */
+        /* <FIXME> the result might differ between 32 and 64bit systems 
+                   because of rounding errors in 'statistics' */
+        maxstat[0] = 0.0;        
+        for (i = 0; i < n; i++) {
+            if (statistics[i] > maxstat[0]) {
+                maxstat[0] = statistics[i];
+                cutpoint[0] = x[i];
+            }
+        }
+        /* </FIXME> */
+    }
+}
+
+
+/**
+    R-interface to C_split (does not handle ordered y's)
+    *\param x values of the transformation
+    *\param y values of the influence function
+    *\param weights case weights
+    *\param orderx the ordering of the transformations, i.e. R> order(x)
+    *\param linexpcov2sample an (uninitialized) object of class 
+                             `LinStatExpectCovar' with p = 1
+    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
+    *\param splitctrl an object of class `SplitControl'
+*/
+
+SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample, 
+             SEXP expcovinf, SEXP splitctrl) {
+             
+    SEXP ans, cutpoint, maxstat, statistics;
+    
+    PROTECT(ans = allocVector(VECSXP, 3));
+    SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
+    SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
+    SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
+    
+    C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
+            INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf,
+            REAL(cutpoint), REAL(maxstat), REAL(statistics));
+    UNPROTECT(1);
+    return(ans);
+}
+
+
+/**
+    Search for a cutpoint in a unordered factor x maximizing a two-sample
+    statistic w.r.t. (the influence function of ) the response variable y.
+    *\param codingx the coding of x, i.e. as.numeric(x)
+    *\param p dimension of the transformation
+    *\param y values of the influence function
+    *\param q dimension of the influence function
+    *\param weights case weights
+    *\param n number of observations
+    *\param codingx the coding of x, i.e. as.numeric(x)
+    *\param standstat the vector of the standardized statistics for x, y, 
+                      weights 
+    *\param splitctrl an object of class `SplitControl'
+    *\param linexpcov2sample an (uninitialized) object of class 
+                             `LinStatExpectCovar' with p = 1
+    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
+    *\param cutpoint return value; pointer to a double for the cutpoint in x
+    *\param levelset return value; pointer to a p-dim 0/1 integer
+    *\param maxstat return value; pointer to a double for the 
+                    maximal test statistic
+    *\param statistics return value; pointer to a n-dim double for 
+                       the statistics
+*/
+
+void C_splitcategorical(const int *codingx, int p,
+                        const double *y, int q,
+                        const double *weights, int n,
+                        double *standstat,
+                        SEXP splitctrl, SEXP linexpcov2sample, 
+                        SEXP expcovinf, double *cutpoint, int *levelset, 
+                        double *maxstat, double *statistics) {
+
+    double *tmpx, *tmptmpx, tmp = 0.0;
+    int *irank, *ordertmpx, i, j, k, l, jp, chk;
+
+    /* allocate memory */
+    tmpx = Calloc(n, double);
+    ordertmpx = Calloc(n, int);
+    irank = Calloc(p, int);
+    tmptmpx = Calloc(n, double);
+
+    /* for all response variables (aka: dummy variables) */
+    for (j = 0; j < q; j++) {
+    
+        jp = j * p;
+
+        /* determine the ranking of the kth level among 
+           the standardized statistic: This induced an ordering of the 
+           observations */
+        for (k = 0; k < p; k++) {
+            irank[k] = 1;
+            for (l = 0; l < p; l++)
+                if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
+        }
+        
+        /* a temporary response variable: the rank of the level */
+        for (i = 0; i < n; i++) {
+            /* <FIXME> do we have to adjust weights for missing values here??? */
+            if (weights[i] == 0.0) {
+                tmpx[i] = 0.0;
+            } else {
+                tmpx[i] = (double) irank[codingx[i] - 1];
+            }
+            /* </FIXME> */
+            tmptmpx[i] = tmpx[i];
+            ordertmpx[i] = i + 1;
+        }
+        
+        /* order(dtmpx) */
+        rsort_with_index(tmptmpx, ordertmpx, n);
+
+        /* search for a cutpoint (now we do have an ordering) */
+        C_split(tmpx, 1, y, q, weights, n, ordertmpx,
+                splitctrl, linexpcov2sample,
+                expcovinf, cutpoint, maxstat, statistics);
+
+        /* if we have seen an improvement: save this segmentation 
+           note: there may be splits with equal goodness */
+        chk = 0;
+        if (maxstat[0] > tmp) {
+            for (k = 0; k < p; k++) {
+                if (irank[k] > cutpoint[0]) {
+                    levelset[k] = 1;
+                    chk += 1;
+                } else {
+                    levelset[k] = 0;
+                }
+            }
+            tmp = maxstat[0];
+        }
+        /* <FIXME> make sure that at least one level goes left,
+                   C_split may end up with cutpoint > max(irank), why?
+           </FIXME>
+        */
+        /* hm, why did I added 
+        if (chk == 0) tmp = 0.0; 
+        ??? */
+    }
+    maxstat[0] = tmp;
+
+    /* free memory */
+    Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
+}
+
+
+/**
+    R-interface to C_splitcategorical (does not handle ordered y's)
+    *\param x the values of the x-transformation
+    *\param codingx the coding of x, i.e. as.numeric(x)
+    *\param y values of the influence function
+    *\param weights case weights
+    *\param linexpcov2sample an (uninitialized) object of class 
+                             `LinStatExpectCovar' with p = 1
+    *\param linexpcov an initialized object of class `LinStatExpectCovar'
+    *\param expcovinf an initialized object of class `ExpectCovarInfluence'
+    *\param splitctrl an object of class `SplitControl'
+*/
+
+SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights, 
+                        SEXP linexpcov2sample, SEXP linexpcov, 
+                        SEXP expcovinf, SEXP splitctrl) {
+             
+    SEXP ans, cutpoint, maxstat, statistics, levelset;
+    double *standstat;
+
+    C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
+                    1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
+
+    standstat = Calloc(get_dimension(linexpcov), double);
+    C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
+                  REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
+                  REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
+                  get_dimension(linexpcov), get_tol(splitctrl), standstat);
+
+    PROTECT(ans = allocVector(VECSXP, 4));
+    SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
+    SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
+    SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
+    SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
+    
+    C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights), 
+                       nrow(x), standstat, 
+                       splitctrl, linexpcov2sample, expcovinf, 
+                       REAL(cutpoint), INTEGER(levelset), REAL(maxstat), 
+                       REAL(statistics));
+
+    UNPROTECT(1);
+    Free(standstat);
+    return(ans);
+}