00001
00009 #include "party.h"
00010
00011
00035 void C_split(const double *x, int p,
00036 const double *y, int q,
00037 const double *weights, int n,
00038 const int *orderx, const double *score_y,
00039 const int ORDERED, SEXP splitctrl, SEXP linexpcov2sample,
00040 SEXP expcovinf, double *cutpoint, double *maxstat,
00041 double *statistics) {
00042
00043 double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar,
00044 tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
00045 double minobs, maxobs, xmax;
00046 int lastj, i, j, k, l;
00047
00048 if (p != 1) error("C_split: p not equal to one");
00049 tol = get_tol(splitctrl);
00050
00051
00052
00053
00054 xmax = 0.0;
00055 for (i = 0; i < n; i++) {
00056 statistics[i] = 0.0;
00057 if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
00058 }
00059
00060
00061
00062 dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
00063 dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
00064 sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
00065
00066
00067 if (sweights > 1) {
00068
00069
00070
00071 minprob = get_minprob(splitctrl);
00072 minbucket = get_minbucket(splitctrl);
00073 minobs = sweights * minprob + 1.0;
00074
00075 if (minobs < minbucket)
00076 minobs = minbucket;
00077 maxobs = sweights * (1 - minprob) - 1.0;
00078 if (maxobs > sweights - minbucket)
00079 maxobs = sweights - minbucket;
00080
00081 f1 = (double) sweights / (sweights - 1);
00082 f2 = 1.0 / (sweights - 1);
00083 w = 0.0;
00084
00085
00086 dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
00087 for (k = 0; k < q; k++) dlinstat[k] = 0.0;
00088 dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
00089 dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
00090
00091 tx = 0.0;
00092 lastj = 0;
00093
00094
00095 for (i = 0; i < (n - 1); i++) {
00096
00097
00098 j = orderx[i] - 1;
00099
00100
00101 if (weights[j] == 0.0) continue;
00102
00103
00104 if (w > 0 && x[j] < tx)
00105 warning("C_split: inconsistent ordering: %f < %f!\n",
00106 x[j], tx);
00107
00108
00109
00110 if (w > 0 && x[j] == tx)
00111 statistics[lastj] = 0.0;
00112
00113
00114 tx = x[j];
00115 lastj = j;
00116
00117 w += weights[j];
00118
00119
00120 if (w >= maxobs || x[j] >= xmax) break;
00121
00122
00123
00124 if (ORDERED) {
00125 for (k = 0; k < q; k++)
00126 dlinstat[0] += score_y[k] * y[n * k + j] * weights[j];
00127
00128
00129 if (w > minobs) {
00130 dexpect[0] = 0.0;
00131 for (k = 0; k < q; k++) {
00132 dexpect[0] += score_y[k] * w * dExp_y[k];
00133 }
00134 dcovar[0] = 0.0;
00135 f1w = f1 * w;
00136 f2ww = f2 * w * w;
00137 for (k = 0; k < q; k++) {
00138 for (l = 0; l < q; l++) {
00139 dcovar[0] += score_y[k] *
00140 (f1w * dCov_y[k*q + l] - f2ww * dCov_y[k*q + l]) *
00141 score_y[l];
00142 }
00143 }
00144 } else {
00145 continue;
00146 }
00147 } else {
00148 for (k = 0; k < q; k++)
00149 dlinstat[k] += y[n * k + j] * weights[j];
00150
00151 if (w > minobs) {
00152 for (k = 0; k < q; k++)
00153 dexpect[k] = w * dExp_y[k];
00154
00155 f1w = f1 * w;
00156 f2ww = f2 * w * w;
00157 for (k = 0; k < q*q; k++)
00158 dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
00159 } else {
00160 continue;
00161 }
00162 }
00163
00164
00165
00166
00167
00168
00169 statistics[j] = 0.0;
00170 for (k = 0; k < q; k++) {
00171 if (dcovar[k * q + k] <= tol) continue;
00172 tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
00173 if (statistics[j] < tmp) statistics[j] = tmp;
00174 }
00175
00176 }
00177
00178
00179 maxstat[0] = 0.0;
00180 for (i = 0; i < (n - 1); i++) {
00181 if (statistics[i] > maxstat[0]) {
00182 maxstat[0] = statistics[i];
00183 cutpoint[0] = x[i];
00184 }
00185 }
00186 }
00187 }
00188
00189
00202 SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample,
00203 SEXP expcovinf, SEXP splitctrl) {
00204
00205 SEXP ans, cutpoint, maxstat, statistics;
00206
00207 PROTECT(ans = allocVector(VECSXP, 3));
00208 SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00209 SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00210 SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00211
00212 C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00213 INTEGER(orderx), NULL, 0, splitctrl, linexpcov2sample, expcovinf,
00214 REAL(cutpoint), REAL(maxstat), REAL(statistics));
00215 UNPROTECT(1);
00216 return(ans);
00217 }
00218
00219
00246 void C_splitcategorical(const int *codingx, int p,
00247 const double *y, int q,
00248 const double *weights, int n,
00249 const double *score_y, const int ORDERED,
00250 double *standstat,
00251 SEXP splitctrl, SEXP linexpcov2sample,
00252 SEXP expcovinf, double *cutpoint, int *levelset,
00253 double *maxstat, double *statistics) {
00254
00255 double tol, *tmpx, *tmptmpx, tmp = 0.0;
00256 int *irank, *ordertmpx, i, j, k, l, jp, chk;
00257
00258 tol = get_tol(splitctrl);
00259
00260
00261 tmpx = Calloc(n, double);
00262 ordertmpx = Calloc(n, int);
00263 irank = Calloc(p, int);
00264 tmptmpx = Calloc(n, double);
00265
00266 if (ORDERED) q = 1;
00267
00268
00269 for (j = 0; j < q; j++) {
00270
00271 jp = j * p;
00272
00273
00274
00275
00276 for (k = 0; k < p; k++) {
00277 irank[k] = 1;
00278 for (l = 0; l < p; l++)
00279 if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
00280 }
00281
00282
00283 for (i = 0; i < n; i++) {
00284 tmpx[i] = (double) irank[codingx[i] - 1];
00285 tmptmpx[i] = tmpx[i];
00286 ordertmpx[i] = i + 1;
00287 }
00288
00289
00290 rsort_with_index(tmptmpx, ordertmpx, n);
00291
00292
00293 C_split(tmpx, 1, y, q, weights, n, ordertmpx, score_y,
00294 ORDERED, splitctrl, linexpcov2sample,
00295 expcovinf, cutpoint, maxstat, statistics);
00296
00297
00298
00299 chk = 0;
00300 if (maxstat[0] > tmp) {
00301 for (k = 0; k < p; k++) {
00302 if (irank[k] > cutpoint[0]) {
00303 levelset[k] = 1;
00304 chk += 1;
00305 } else {
00306 levelset[k] = 0;
00307 }
00308 }
00309 tmp = maxstat[0];
00310 }
00311
00312
00313
00314
00315
00316
00317
00318 }
00319 maxstat[0] = tmp;
00320
00321
00322 Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
00323 }
00324
00325
00339 SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights,
00340 SEXP linexpcov2sample, SEXP linexpcov,
00341 SEXP expcovinf, SEXP splitctrl) {
00342
00343 SEXP ans, cutpoint, maxstat, statistics, levelset;
00344 double *standstat;
00345
00346 C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00347 1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
00348
00349 standstat = Calloc(get_dimension(linexpcov), double);
00350 C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
00351 REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
00352 REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
00353 get_dimension(linexpcov), get_tol(splitctrl), standstat);
00354
00355 PROTECT(ans = allocVector(VECSXP, 4));
00356 SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00357 SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00358 SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00359 SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
00360
00361 C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights),
00362 nrow(x), NULL, 0, standstat,
00363 splitctrl, linexpcov2sample, expcovinf,
00364 REAL(cutpoint), INTEGER(levelset), REAL(maxstat),
00365 REAL(statistics));
00366
00367 UNPROTECT(1);
00368 Free(standstat);
00369 return(ans);
00370 }