Main Page | Directories | File List | File Members | Related Pages

Splits.c

Go to the documentation of this file.
00001 
00009 #include "party.h"
00010 
00011 
00035 void C_split(const double *x, int p,
00036              const double *y, int q,
00037              const double *weights, int n,
00038              const int *orderx, const double *score_y,
00039              const int ORDERED, SEXP splitctrl, SEXP linexpcov2sample, 
00040              SEXP expcovinf, double *cutpoint, double *maxstat, 
00041              double *statistics) {
00042 
00043     double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar, 
00044            tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
00045     double minobs, maxobs, xmax;
00046     int lastj, i, j, k, l;
00047 
00048     if (p != 1) error("C_split: p not equal to one");
00049     tol = get_tol(splitctrl);
00050 
00051     /* init statistics and determine the maximal value with positive weight 
00052        since we can't choose this one as cutpoint
00053     */
00054     xmax = 0.0;
00055     for (i = 0; i < n; i++) {
00056         statistics[i] = 0.0;
00057         if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
00058     }
00059 
00060     /* we already have expecation and covariance of the response
00061      * values and the sum of the weights */
00062     dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
00063     dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
00064     sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
00065 
00066     /* if there is something to split */
00067     if (sweights > 1) {
00068 
00069         /* we need to ensure that at least minbucket weights 
00070            are there to split (either left or right) */
00071         minprob = get_minprob(splitctrl);
00072         minbucket = get_minbucket(splitctrl);
00073         minobs = sweights * minprob + 1.0;
00074 
00075         if (minobs < minbucket) 
00076             minobs = minbucket; 
00077         maxobs = sweights * (1 - minprob) - 1.0;
00078         if (maxobs > sweights - minbucket) 
00079             maxobs = sweights - minbucket; 
00080 
00081         f1 = (double) sweights / (sweights - 1);
00082         f2 = 1.0 / (sweights - 1);
00083         w = 0.0;
00084     
00085         /* pointers to the R-objects */
00086         dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
00087         for (k = 0; k < q; k++) dlinstat[k] = 0.0;
00088         dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
00089         dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
00090 
00091         tx = 0.0;
00092         lastj = 0;
00093 
00094         /* for all possible cutpoints (defined by the observations x) */
00095         for (i = 0; i < (n - 1); i++) {
00096     
00097             /* the ordering of the ith observation */
00098             j = orderx[i] - 1;
00099         
00100             /* if the corresponding weight is zero */
00101             if (weights[j] == 0.0) continue;
00102 
00103             /* just a check: can be removed later */
00104             if (w > 0 && x[j] < tx)
00105                 warning("C_split: inconsistent ordering: %f < %f!\n", 
00106                         x[j], tx);
00107         
00108             /* handle ties: delete the entry of the last visited observation
00109                (take care of zero weights!) */
00110             if (w > 0 && x[j] == tx)
00111                 statistics[lastj] = 0.0; 
00112 
00113             /* store the value and position of the j smallest observation */
00114             tx = x[j];
00115             lastj = j;
00116         
00117             w += weights[j];
00118 
00119             /* do not consider those splits */
00120             if (w >= maxobs || x[j] >= xmax) break;
00121 
00122             /* compute the linear statistic and expectation and 
00123              * covariance if needed */
00124             if (ORDERED) {
00125                 for (k = 0; k < q; k++)
00126                     dlinstat[0] += score_y[k] * y[n * k + j] * weights[j];
00127 
00128                 /* do not consider those splits */
00129                 if (w > minobs) {
00130                     dexpect[0] = 0.0;
00131                     for (k = 0; k < q; k++) {
00132                         dexpect[0] += score_y[k] * w * dExp_y[k];
00133                     }
00134                     dcovar[0] = 0.0;
00135                     f1w = f1 * w;
00136                     f2ww = f2 * w * w;
00137                     for (k = 0; k < q; k++) {
00138                         for (l = 0; l < q; l++) {
00139                             dcovar[0] += score_y[k] * 
00140                             (f1w * dCov_y[k*q + l] - f2ww * dCov_y[k*q + l]) * 
00141                             score_y[l];
00142                         }
00143                     }
00144                 } else {
00145                     continue;
00146                 }
00147             } else {
00148                 for (k = 0; k < q; k++)
00149                     dlinstat[k] += y[n * k + j] * weights[j];
00150  
00151                 if (w > minobs) {
00152                     for (k = 0; k < q; k++)
00153                         dexpect[k] = w * dExp_y[k];
00154 
00155                     f1w = f1 * w;
00156                     f2ww = f2 * w * w;
00157                     for (k = 0; k < q*q; k++)
00158                         dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
00159                 } else {
00160                     continue;
00161                 }
00162             }
00163         
00164             /* the absolute standardized test statistic, to be maximized */
00165             /* statistics[j] = C_maxabsTestStatistic(dlinstat, 
00166                    dexpect, dcovar, q, tol); */
00167 
00168             /* much faster but uses maxabs always*/
00169             statistics[j] = 0.0;
00170             for (k = 0; k < q; k++) {
00171                 if (dcovar[k * q + k] <= tol) continue;
00172                 tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
00173                 if (statistics[j] < tmp) statistics[j] = tmp;
00174             }
00175 
00176         }
00177     
00178         /* search for the maximum and the best separating cutpoint */
00179         maxstat[0] = 0.0;        
00180         for (i = 0; i < (n - 1); i++) {
00181             if (statistics[i] > maxstat[0]) {
00182                 maxstat[0] = statistics[i];
00183                 cutpoint[0] = x[i];
00184             }
00185         }
00186     }
00187 }
00188 
00189 
00202 SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample, 
00203              SEXP expcovinf, SEXP splitctrl) {
00204              
00205     SEXP ans, cutpoint, maxstat, statistics;
00206     
00207     PROTECT(ans = allocVector(VECSXP, 3));
00208     SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00209     SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00210     SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00211     
00212     C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00213             INTEGER(orderx), NULL, 0, splitctrl, linexpcov2sample, expcovinf,
00214             REAL(cutpoint), REAL(maxstat), REAL(statistics));
00215     UNPROTECT(1);
00216     return(ans);
00217 }
00218 
00219 
00246 void C_splitcategorical(const int *codingx, int p,
00247                         const double *y, int q,
00248                         const double *weights, int n,
00249                         const double *score_y, const int ORDERED, 
00250                         double *standstat,
00251                         SEXP splitctrl, SEXP linexpcov2sample, 
00252                         SEXP expcovinf, double *cutpoint, int *levelset, 
00253                         double *maxstat, double *statistics) {
00254 
00255     double tol, *tmpx, *tmptmpx, tmp = 0.0;
00256     int *irank, *ordertmpx, i, j, k, l, jp, chk;
00257 
00258     tol = get_tol(splitctrl);          
00259                      
00260     /* allocate memory */
00261     tmpx = Calloc(n, double);
00262     ordertmpx = Calloc(n, int);
00263     irank = Calloc(p, int);
00264     tmptmpx = Calloc(n, double);
00265 
00266     if (ORDERED) q = 1;
00267     
00268     /* for all response variables (aka: dummy variables) */
00269     for (j = 0; j < q; j++) {
00270     
00271         jp = j * p;
00272 
00273         /* determine the ranking of the kth level among 
00274            the standardized statistic: This induced an ordering of the 
00275            observations */
00276         for (k = 0; k < p; k++) {
00277             irank[k] = 1;
00278             for (l = 0; l < p; l++)
00279                 if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
00280         }
00281         
00282         /* a temporary response variable: the rank of the level */
00283         for (i = 0; i < n; i++) {
00284             tmpx[i] = (double) irank[codingx[i] - 1];
00285             tmptmpx[i] = tmpx[i];
00286             ordertmpx[i] = i + 1;
00287         }
00288         
00289         /* order(dtmpx) */
00290         rsort_with_index(tmptmpx, ordertmpx, n);
00291 
00292         /* search for a cutpoint (now we do have an ordering) */
00293         C_split(tmpx, 1, y, q, weights, n, ordertmpx, score_y,
00294                 ORDERED, splitctrl, linexpcov2sample,
00295                 expcovinf, cutpoint, maxstat, statistics);
00296 
00297         /* if we have seen an improvement: save this segmentation 
00298            note: there may be splits with equal goodness */
00299         chk = 0;
00300         if (maxstat[0] > tmp) {
00301             for (k = 0; k < p; k++) {
00302                 if (irank[k] > cutpoint[0]) {
00303                     levelset[k] = 1;
00304                     chk += 1;
00305                 } else {
00306                     levelset[k] = 0;
00307                 }
00308             }
00309             tmp = maxstat[0];
00310         }
00311         /* <FIXME> make sure that at least one level goes left,
00312                    C_split may end up with cutpoint > max(irank), why?
00313            </FIXME>
00314         */
00315         /* hm, why did I added 
00316         if (chk == 0) tmp = 0.0; 
00317         ??? */
00318     }
00319     maxstat[0] = tmp;
00320 
00321     /* free memory */
00322     Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
00323 }
00324 
00325 
00339 SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights, 
00340                         SEXP linexpcov2sample, SEXP linexpcov, 
00341                         SEXP expcovinf, SEXP splitctrl) {
00342              
00343     SEXP ans, cutpoint, maxstat, statistics, levelset;
00344     double *standstat;
00345 
00346     C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00347                     1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
00348 
00349     standstat = Calloc(get_dimension(linexpcov), double);
00350     C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
00351                   REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
00352                   REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
00353                   get_dimension(linexpcov), get_tol(splitctrl), standstat);
00354 
00355     PROTECT(ans = allocVector(VECSXP, 4));
00356     SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00357     SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00358     SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00359     SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
00360     
00361     C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights), 
00362                        nrow(x), NULL, 0, standstat, 
00363                        splitctrl, linexpcov2sample, expcovinf, 
00364                        REAL(cutpoint), INTEGER(levelset), REAL(maxstat), 
00365                        REAL(statistics));
00366 
00367     UNPROTECT(1);
00368     Free(standstat);
00369     return(ans);
00370 }

Generated on Tue Apr 4 13:20:29 2006 for party by  doxygen 1.4.2