The evaluation of this vignette is set to FALSE.
knitr::opts_chunk$set(
collapse = TRUE, comment = "#>", eval = FALSE, cache = FALSE,
warning = FALSE, fig.width = 8, fig.height = 5
)
library(fastcpd)
#' Cost function for logistic regression, i.e., the binomial family in GLM.
#'
#' @param data Data to be used to calculate the cost values. The last column is
#' the response variable.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_glm_binomial <- function(data, family = "binomial") {
data <- as.matrix(data)
p <- dim(data)[2] - 1
out <- fastglm::fastglm(
as.matrix(data[, 1:p]), data[, p + 1],
family = family
)
return(out$deviance / 2)
}
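As a quick sketch (simulated data, purely illustrative), the cost of a segment is half the binomial deviance of a logistic fit on that segment:
set.seed(1)
X_toy <- matrix(rnorm(100 * 3), 100, 3)
y_toy <- rbinom(100, 1, 1 / (1 + exp(-X_toy %*% c(1, -1, 0.5))))
cost_glm_binomial(cbind(X_toy, y_toy)) # half the deviance of the segment fit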
#' Implementation of vanilla PELT for logistic regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @keywords internal
#'
#' @noRd
#' @return A list consisting of two elements: the change point locations and
#' the total negative log likelihood across the segments.
pelt_vanilla_binomial <- function(data, beta, cost = cost_glm_binomial) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:m)
{
k <- set[i] + 1
cval[i] <- 0
if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ]))
}
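# PELT recursion: for each remaining candidate s, the objective is
# F(s) + C(s + 1, t) + beta, and F(t) is its minimum.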
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
cp_set <- append(cp_set, list(cp_set_add))
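# Pruning: a candidate s with F(s) + C(s + 1, t) > F(t) can never be optimal
# at any later time point, so it is removed from the candidate set.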
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
Fobj <- c(Fobj, min_val)
}
cp <- cp_set[[n + 1]]
nLL <- 0
cp_loc <- unique(c(0, cp, n))
for (i in 1:(length(cp_loc) - 1))
{
seg <- (cp_loc[i] + 1):cp_loc[i + 1]
data_seg <- data[seg, ]
out <- fastglm::fastglm(
as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
family = "binomial"
)
nLL <- out$deviance / 2 + nLL
}
output <- list(cp, nLL)
names(output) <- c("cp", "nLL")
return(output)
}
#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coefficient.
#' @param coef Previous coefficient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#' averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#' the Hessian matrix.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the updated coefficients, the running sum of
#' past coefficients, and the updated Hessian matrix.
cost_logistic_update <- function(
data_new, coef, cum_coef, cmatrix, epsilon = 1e-10) {
p <- length(data_new) - 1
X_new <- data_new[1:p]
Y_new <- data_new[p + 1]
eta <- X_new %*% coef
mu <- 1 / (1 + exp(-eta))
cmatrix <- cmatrix + (X_new %o% X_new) * as.numeric((1 - mu) * mu)
lik_dev <- as.numeric(-(Y_new - mu)) * X_new
coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
cum_coef <- cum_coef + coef
return(list(coef, cum_coef, cmatrix))
}
#' Calculate negative log likelihood given data segment and guess of
#' coefficient.
#'
#' @param data Data to be used to calculate the negative log likelihood.
#' @param b Guess of the coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log likelihood.
neg_log_lik_binomial <- function(data, b) {
p <- dim(data)[2] - 1
X <- data[, 1:p, drop = FALSE]
Y <- data[, p + 1, drop = FALSE]
u <- as.numeric(X %*% b)
L <- -Y * u + log(1 + exp(u))
return(sum(L))
}
#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess of the number of change points, used to pre-segment
#' the data.
#' @param trim Proportion of the data within which change points detected too
#' close to the beginning, the end, or other change points are discarded.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the potential change point locations and the
#' total negative log likelihood based on the estimated change points.
segd_binomial <- function(data, beta, B = 10, trim = 0.025) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
# choose the initial values based on pre-segmentation
index <- rep(1:B, rep(n / B, B))
coef.int <- matrix(NA, B, p)
for (i in 1:B)
{
out <- fastglm::fastglm(
as.matrix(data[index == i, 1:p]),
data[index == i, p + 1],
family = "binomial"
)
coef.int[i, ] <- coef(out)
}
X1 <- data[1, 1:p]
cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
e_eta <- exp(coef %*% X1)
const <- e_eta / (1 + e_eta)^2
cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1))
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:(m - 1))
{
coef_c <- coef[, i]
cum_coef_c <- cum_coef[, i]
cmatrix_c <- cmatrix[, , i]
out <- cost_logistic_update(data[t, ], coef_c, cum_coef_c, cmatrix_c)
coef[, i] <- out[[1]]
cum_coef[, i] <- out[[2]]
cmatrix[, , i] <- out[[3]]
k <- set[i] + 1
cval[i] <- 0
if (t - k >= p - 1) {
cval[i] <-
neg_log_lik_binomial(data[k:t, ], cum_coef[, i] / (t - k + 1))
}
}
# the choice of initial values requires further investigation
cval[m] <- 0
Xt <- data[t, 1:p]
cum_coef_add <- coef_add <- coef.int[index[t], ]
e_eta_t <- exp(coef_add %*% Xt)
const <- e_eta_t / (1 + e_eta_t)^2
cmatrix_add <- (Xt %o% Xt) * as.numeric(const)
coef <- cbind(coef, coef_add)
cum_coef <- cbind(cum_coef, cum_coef_add)
cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)
# Adding a momentum term (TBD)
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
cp_set <- append(cp_set, list(cp_set_add))
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
coef <- coef[, ind2, drop = FALSE]
cum_coef <- cum_coef[, ind2, drop = FALSE]
cmatrix <- cmatrix[, , ind2, drop = FALSE]
Fobj <- c(Fobj, min_val)
}
# Remove change-points close to the boundaries
cp <- cp_set[[n + 1]]
if (length(cp) > 0) {
# Guard against an empty ind3: cp[-integer(0)] would drop every change point.
ind3 <- seq_along(cp)[(cp < trim * n) | (cp > (1 - trim) * n)]
if (length(ind3) > 0) cp <- cp[-ind3]
}
nLL <- 0
cp_loc <- unique(c(0, cp, n))
for (i in 1:(length(cp_loc) - 1))
{
seg <- (cp_loc[i] + 1):cp_loc[i + 1]
data_seg <- data[seg, ]
out <- fastglm::fastglm(
as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
family = "binomial"
)
nLL <- out$deviance / 2 + nLL
}
output <- list(cp, nLL)
names(output) <- c("cp", "nLL")
return(output)
}
#' Cost function for Poisson regression.
#'
#' @param data Data to be used to calculate the cost values. The last column is
#' the response variable.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_glm_poisson <- function(data, family = "poisson") {
data <- as.matrix(data)
p <- dim(data)[2] - 1
out <- fastglm::fastglm(as.matrix(data[, 1:p]), data[, p + 1], family = family)
return(out$deviance / 2)
}
#' Implementation of vanilla PELT for Poisson regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the change point locations.
pelt_vanilla_poisson <- function(data, beta, cost = cost_glm_poisson) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:m)
{
k <- set[i] + 1
if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ])) else cval[i] <- 0
}
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
cp_set <- append(cp_set, list(cp_set_add))
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
Fobj <- c(Fobj, min_val)
# if (t %% 100 == 0) print(t)
}
cp <- cp_set[[n + 1]]
output <- list(cp)
names(output) <- c("cp")
return(output)
}
#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coefficient.
#' @param coef Previous coefficient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#' averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#' the Hessian matrix.
#' @param G Upper bound on the fitted Poisson mean, used to cap the weight in
#' the Hessian update.
#' @param L Winsorization lower bound for the coefficients.
#' @param H Winsorization upper bound for the coefficients.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the updated coefficients, the running sum of
#' past coefficients, and the updated Hessian matrix.
cost_poisson_update <- function(data_new, coef, cum_coef, cmatrix, epsilon = 0.001, G = 10^10, L = -20, H = 20) {
p <- length(data_new) - 1
X_new <- data_new[1:p]
Y_new <- data_new[p + 1]
eta <- X_new %*% coef
mu <- exp(eta)
cmatrix <- cmatrix + (X_new %o% X_new) * min(as.numeric(mu), G)
lik_dev <- as.numeric(-(Y_new - mu)) * X_new
coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
coef <- pmin(pmax(coef, L), H)
cum_coef <- cum_coef + coef
return(list(coef, cum_coef, cmatrix))
}
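A toy illustration (contrived values, not from the analysis below) of the winsorization: with an extreme response the Newton step overshoots badly, and the updated coefficients are clipped to the [L, H] box.
out_toy <- cost_poisson_update(
  data_new = c(1, 1, 1e6), # covariates (1, 1) with an extreme response
  coef = matrix(0, 2, 1),
  cum_coef = matrix(0, 2, 1),
  cmatrix = diag(1e-3, 2)
)
out_toy[[1]] # both coordinates are clipped to H = 20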
#' Calculate negative log likelihood given data segment and guess of
#' coefficient.
#'
#' @param data Data to be used to calculate the negative log likelihood.
#' @param b Guess of the coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log likelihood.
neg_log_lik_poisson <- function(data, b) {
p <- dim(data)[2] - 1
X <- data[, 1:p, drop = FALSE]
Y <- data[, p + 1, drop = FALSE]
u <- as.numeric(X %*% b)
L <- -Y * u + exp(u) + lfactorial(Y)
return(sum(L))
}
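Since the expression above is the negative Poisson log-density summed over the segment, it must agree with dpois() at any coefficient value; a small simulated check:
set.seed(3)
X_toy <- matrix(rnorm(20), 10, 2)
b_toy <- c(0.3, -0.2)
y_toy <- rpois(10, exp(X_toy %*% b_toy))
c(neg_log_lik_poisson(cbind(X_toy, y_toy), b_toy),
  -sum(dpois(y_toy, exp(X_toy %*% b_toy), log = TRUE)))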
#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess of the number of change points, used to pre-segment
#' the data.
#' @param trim Proportion of the data within which change points detected too
#' close to the beginning, the end, or other change points are discarded.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#' the Hessian matrix.
#' @param G Upper bound on the fitted Poisson mean, used to cap the weight in
#' the Hessian update.
#' @param L Winsorization lower bound for the coefficients.
#' @param H Winsorization upper bound for the coefficients.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the potential change point locations.
segd_poisson <- function(data, beta, B = 10, trim = 0.03, epsilon = 0.001, G = 10^10, L = -20, H = 20) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
# choose the initial values based on pre-segmentation
index <- rep(1:B, rep(n / B, B))
coef.int <- matrix(NA, B, p)
for (i in 1:B)
{
out <- fastglm::fastglm(x = as.matrix(data[index == i, 1:p]), y = data[index == i, p + 1], family = "poisson")
coef.int[i, ] <- coef(out)
}
X1 <- data[1, 1:p]
cum_coef <- coef <- pmin(pmax(matrix(coef.int[1, ], p, 1), L), H)
e_eta <- exp(coef %*% X1)
const <- e_eta
cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1))
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:(m - 1))
{
coef_c <- coef[, i]
cum_coef_c <- cum_coef[, i]
cmatrix_c <- cmatrix[, , i]
out <- cost_poisson_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, epsilon = epsilon, G = G, L = L, H = H)
coef[, i] <- out[[1]]
cum_coef[, i] <- out[[2]]
cmatrix[, , i] <- out[[3]]
k <- set[i] + 1
cum_coef_win <- pmin(pmax(cum_coef[, i] / (t - k + 1), L), H)
if (t - k >= p - 1) cval[i] <- neg_log_lik_poisson(data[k:t, ], cum_coef_win) else cval[i] <- 0
}
# the choice of initial values requires further investigation
cval[m] <- 0
Xt <- data[t, 1:p]
cum_coef_add <- coef_add <- pmin(pmax(coef.int[index[t], ], L), H)
e_eta_t <- exp(coef_add %*% Xt)
const <- e_eta_t
cmatrix_add <- (Xt %o% Xt) * as.numeric(const)
coef <- cbind(coef, coef_add)
cum_coef <- cbind(cum_coef, cum_coef_add)
cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)
# Adding a momentum term (TBD)
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
cp_set <- append(cp_set, list(cp_set_add))
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
coef <- coef[, ind2, drop = FALSE]
cum_coef <- cum_coef[, ind2, drop = FALSE]
cmatrix <- cmatrix[, , ind2, drop = FALSE]
Fobj <- c(Fobj, min_val)
}
# Remove change-points close to the boundaries
cp <- cp_set[[n + 1]]
if (length(cp) > 0) {
ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)]
if (length(ind3) > 0) cp <- cp[-ind3]
}
cp <- sort(unique(c(0, cp)))
index <- which(diff(cp) < trim * n)
if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
cp <- cp[cp > 0]
# nLL <- 0
# cp_loc <- unique(c(0,cp,n))
# for(i in 1:(length(cp_loc)-1))
# {
# seg <- (cp_loc[i]+1):cp_loc[i+1]
# data_seg <- data[seg,]
# out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="Poisson")
# nLL <- out$deviance/2 + nLL
# }
# output <- list(cp, nLL)
# names(output) <- c("cp", "nLL")
output <- list(cp)
names(output) <- c("cp")
return(output)
}
# Generate data from Poisson regression models with change-points
#' @param n Number of observations.
#' @param d Dimension of the covariates.
#' @param true.coef True regression coefficients.
#' @param true.cp.loc True change-point locations.
#' @param Sigma Covariance matrix of the covariates.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the generated data and the true cluster
#' assignments.
data_gen_poisson <- function(n, d, true.coef, true.cp.loc, Sigma) {
loc <- unique(c(0, true.cp.loc, n))
if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match")
x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
y <- NULL
for (i in 1:(length(loc) - 1))
{
mu <- exp(x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE])
group <- rpois(length(mu), mu)
y <- c(y, group)
}
data <- cbind(x, y)
true_cluster <- rep(1:(length(loc) - 1), diff(loc))
result <- list(data, true_cluster)
return(result)
}
#' Cost function for penalized linear regression.
#'
#' @param data Data to be used to calculate the cost values. The last column is
#' the response variable.
#' @param lambda Penalty coefficient.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_lasso <- function(data, lambda, family = "gaussian") {
data <- as.matrix(data)
n <- dim(data)[1]
p <- dim(data)[2] - 1
out <- glmnet::glmnet(as.matrix(data[, 1:p]), data[, p + 1], family = family, lambda = lambda)
return(deviance(out) / 2)
}
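A quick sketch (simulated sparse regression; the lambda value mimics the err_sd * sqrt(2 * log(p) / n) rate used in the PELT and SeGD lasso functions below):
set.seed(5)
n_toy <- 100
p_toy <- 20
X_toy <- matrix(rnorm(n_toy * p_toy), n_toy, p_toy)
y_toy <- X_toy[, 1:3] %*% c(2, -2, 1) + rnorm(n_toy)
cost_lasso(cbind(X_toy, y_toy), lambda = sqrt(2 * log(p_toy) / n_toy))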
#' Implementation of vanilla PELT for penalized linear regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Number of blocks used to pre-estimate the error standard deviation
#' and the average size of the active set.
#' @param cost Cost function to be used to calculate cost values.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the change point locations.
pelt_vanilla_lasso <- function(data, beta, B = 10, cost = cost_lasso, family = "gaussian") {
n <- dim(data)[1]
p <- dim(data)[2] - 1
index <- rep(1:B, rep(n / B, B))
err_sd <- act_num <- rep(NA, B)
for (i in 1:B)
{
cvfit <- glmnet::cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family)
coef <- coef(cvfit, s = "lambda.1se")[-1]
resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef)
err_sd[i] <- sqrt(mean(resi^2))
act_num[i] <- sum(abs(coef) > 0)
}
err_sd_mean <- mean(err_sd) # only works if error sd is unchanged.
act_num_mean <- mean(act_num)
beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:m)
{
k <- set[i] + 1
if (t - k >= 1) cval[i] <- suppressWarnings(cost(data[k:t, ], lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1)))) else cval[i] <- 0
}
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
cp_set <- append(cp_set, list(cp_set_add))
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
Fobj <- c(Fobj, min_val)
if (t %% 100 == 0) print(t)
}
cp <- cp_set[[n + 1]]
# nLL <- 0
# cp_loc <- unique(c(0,cp,n))
# for(i in 1:(length(cp_loc)-1))
# {
# seg <- (cp_loc[i]+1):cp_loc[i+1]
# data_seg <- data[seg,]
# out <- glmnet(as.matrix(data_seg[, 1:p]), data_seg[, p+1], lambda=lambda, family=family)
# nLL <- deviance(out)/2 + nLL
# }
# output <- list(cp, nLL)
output <- list(cp)
names(output) <- c("cp")
return(output)
}
#' Soft-thresholding operator used in the lasso coefficient updates.
#' @param a Value(s) to be thresholded.
#' @param lambda Threshold level.
#' @keywords internal
#'
#' @noRd
#' @return Soft-thresholded value(s).
soft_threshold <- function(a, lambda) {
sign(a) * pmax(abs(a) - lambda, 0)
}
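For example, with threshold 1, entries in [-1, 1] are set to zero and the rest shrink toward zero by 1:
soft_threshold(c(-3, -0.5, 0.2, 2), 1)
#> [1] -2  0  0  1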
#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coefficient.
#' @param coef Previous coefficient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#' averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the updated coefficients, the running sum of
#' past coefficients, and the updated Hessian matrix.
cost_lasso_update <- function(data_new, coef, cum_coef, cmatrix, lambda) {
p <- length(data_new) - 1
X_new <- data_new[1:p]
Y_new <- data_new[p + 1]
mu <- X_new %*% coef
cmatrix <- cmatrix + X_new %o% X_new
# B <- as.vector(cmatrix_inv%*%X_new)
# cmatrix_inv <- cmatrix_inv - B%o%B/(1+sum(X_new*B))
lik_dev <- as.numeric(-(Y_new - mu)) * X_new
coef <- coef - solve(cmatrix, lik_dev)
nc <- norm(cmatrix, type = "F") # the choice of norm affects the speed. Spectral norm is more accurate but slower than F norm.
coef <- soft_threshold(coef, lambda / nc)
cum_coef <- cum_coef + coef
return(list(coef, cum_coef, cmatrix))
}
#' Calculate negative log likelihood given data segment and guess of
#' coefficient.
#'
#' @param data Data to be used to calculate the negative log likelihood.
#' @param b Guess of the coefficient.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log likelihood.
neg_log_lik_lasso <- function(data, b, lambda) {
p <- dim(data)[2] - 1
X <- data[, 1:p, drop = FALSE]
Y <- data[, p + 1, drop = FALSE]
resi <- Y - X %*% b
L <- sum(resi^2) / 2 + lambda * sum(abs(b))
return(L)
}
#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess of the number of change points, used to pre-segment
#' the data.
#' @param trim Proportion of the data within which change points detected too
#' close to the beginning, the end, or other change points are discarded.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#' the Hessian matrix.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the potential change point locations.
segd_lasso <- function(data, beta, B = 10, trim = 0.025, epsilon = 1e-5, family = "gaussian") {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
# choose the initial values based on pre-segmentation
index <- rep(1:B, rep(n / B, B))
coef.int <- matrix(NA, B, p)
err_sd <- act_num <- rep(NA, B)
for (i in 1:B)
{
cvfit <- glmnet::cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family)
coef.int[i, ] <- coef(cvfit, s = "lambda.1se")[-1]
resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef.int[i, ])
err_sd[i] <- sqrt(mean(resi^2))
act_num[i] <- sum(abs(coef.int[i, ]) > 0)
}
err_sd_mean <- mean(err_sd) # only works if error sd is unchanged.
act_num_mean <- mean(act_num)
beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices
X1 <- data[1, 1:p]
cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
eta <- coef %*% X1
# c_int <- diag(1/epsilon,p) - X1%o%X1/epsilon^2/(1+sum(X1^2)/epsilon)
# cmatrix_inv <- array(c_int, c(p,p,1))
cmatrix <- array(X1 %o% X1 + epsilon * diag(1, p), c(p, p, 1))
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:(m - 1))
{
coef_c <- coef[, i]
cum_coef_c <- cum_coef[, i]
# cmatrix_inv_c <- cmatrix_inv[,,i]
cmatrix_c <- cmatrix[, , i]
k <- set[i] + 1
out <- cost_lasso_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1)))
coef[, i] <- out[[1]]
cum_coef[, i] <- out[[2]]
# cmatrix_inv[,,i] <- out[[3]]
cmatrix[, , i] <- out[[3]]
if (t - k >= 2) cval[i] <- neg_log_lik_lasso(data[k:t, ], cum_coef[, i] / (t - k + 1), lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))) else cval[i] <- 0
}
# the choice of initial values requires further investigation
cval[m] <- 0
Xt <- data[t, 1:p]
cum_coef_add <- coef_add <- coef.int[index[t], ]
# cmatrix_inv_add <- diag(1/epsilon,p) - Xt%o%Xt/epsilon^2/(1+sum(Xt^2)/epsilon)
coef <- cbind(coef, coef_add)
cum_coef <- cbind(cum_coef, cum_coef_add)
# cmatrix_inv <- abind::abind(cmatrix_inv, cmatrix_inv_add, along=3)
cmatrix <- abind::abind(cmatrix, Xt %o% Xt + epsilon * diag(1, p), along = 3)
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
cp_set <- append(cp_set, list(cp_set_add))
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
coef <- coef[, ind2, drop = FALSE]
cum_coef <- cum_coef[, ind2, drop = FALSE]
cmatrix <- cmatrix[, , ind2, drop = FALSE]
# cmatrix_inv <- cmatrix_inv[,,ind2,drop=FALSE]
Fobj <- c(Fobj, min_val)
}
# Remove change-points close to the boundaries and merge change-points
cp <- cp_set[[n + 1]]
if (length(cp) > 0) {
ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)]
if (length(ind3) > 0) cp <- cp[-ind3]
}
cp <- sort(unique(c(0, cp)))
index <- which(diff(cp) < trim * n)
if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
cp <- cp[cp > 0]
# nLL <- 0
# cp_loc <- unique(c(0,cp,n))
# for(i in 1:(length(cp_loc)-1))
# {
# seg <- (cp_loc[i]+1):cp_loc[i+1]
# data_seg <- data[seg,]
# out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="binomial")
# nLL <- out$deviance/2 + nLL
# }
output <- list(cp)
names(output) <- c("cp")
return(output)
}
# Generate data from penalized linear regression models with change-points
#' @param n Number of observations.
#' @param d Dimension of the covariates.
#' @param true.coef True regression coefficients.
#' @param true.cp.loc True change-point locations.
#' @param Sigma Covariance matrix of the covariates.
#' @param evar Error variance.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the generated data and the true cluster
#' assignments.
data_gen_lasso <- function(n, d, true.coef, true.cp.loc, Sigma, evar) {
loc <- unique(c(0, true.cp.loc, n))
if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match")
x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
y <- NULL
for (i in 1:(length(loc) - 1))
{
Xb <- x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE]
add <- Xb + rnorm(length(Xb), sd = sqrt(evar))
y <- c(y, add)
}
data <- cbind(x, y)
true_cluster <- rep(1:(length(loc) - 1), diff(loc))
result <- list(data, true_cluster)
return(result)
}
set.seed(1)
p <- 5
x <- matrix(rnorm(300 * p, 0, 1), ncol = p)
# Randomly generate coefficients with different means.
theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1))
# Randomly generate response variables based on the segmented data and
# corresponding coefficients
y <- c(
rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))),
rbinom(300 - 125, 1, 1 / (1 + exp(-x[(125 + 1):300, ] %*% theta[2, ])))
)
segd_binomial(cbind(x, y), (p + 1) * log(300) / 2, B = 5)$cp
#> [1] 125
fastcpd.binomial(
cbind(y, x),
segment_count = 5,
beta = "BIC",
cost_adjustment = "BIC",
r.progress = FALSE
)@cp_set
#> [1] 125
pelt_vanilla_binomial(cbind(x, y), (p + 1) * log(300) / 2)$cp
#> [1] 0 125
fastcpd.binomial(
cbind(y, x),
segment_count = 5,
vanilla_percentage = 1,
beta = "BIC",
cost_adjustment = "BIC",
r.progress = FALSE
)@cp_set
#> [1] 125
set.seed(1)
n <- 1500
d <- 5
rho <- 0.9
Sigma <- array(0, c(d, d))
for (i in 1:d) {
Sigma[i, ] <- rho^(abs(i - (1:d)))
}
delta <- c(5, 7, 9, 11, 13)
a.sq <- 1
delta.new <-
delta * sqrt(a.sq) / sqrt(as.numeric(t(delta) %*% Sigma %*% delta))
true.cp.loc <- c(375, 750, 1125)
# regression coefficients
true.coef <- matrix(0, nrow = d, ncol = length(true.cp.loc) + 1)
true.coef[, 1] <- c(1, 1.2, -1, 0.5, -2)
true.coef[, 2] <- true.coef[, 1] + delta.new
true.coef[, 3] <- true.coef[, 1]
true.coef[, 4] <- true.coef[, 3] - delta.new
out <- data_gen_poisson(n, d, true.coef, true.cp.loc, Sigma)
data <- out[[1]]
g_tr <- out[[2]]
beta <- log(n) * (d + 1) / 2
segd_poisson(
data, beta, trim = 0.03, B = 10, epsilon = 0.001, G = 10^10, L = -20, H = 20
)$cp
#> [1] 380 751 1136 1251
fastcpd.poisson(
cbind(data[, d + 1], data[, 1:d]),
beta = beta,
cost_adjustment = "BIC",
epsilon = 0.001,
segment_count = 10,
r.progress = FALSE
)@cp_set
#> [1] 380 751 1136 1251
pelt_vanilla_poisson(data, beta)$cp
#> [1] 0 374 752 1133
fastcpd.poisson(
cbind(data[, d + 1], data[, 1:d]),
segment_count = 10,
vanilla_percentage = 1,
beta = beta,
cost_adjustment = "BIC",
r.progress = FALSE
)@cp_set
#> [1] 374 752 1133
set.seed(1)
n <- 1000
s <- 3
d <- 50
evar <- 0.5
Sigma <- diag(1, d)
true.cp.loc <- c(100, 300, 500, 800, 900)
seg <- length(true.cp.loc) + 1
true.coef <- matrix(rnorm(seg * s), s, seg)
true.coef <- rbind(true.coef, matrix(0, d - s, seg))
out <- data_gen_lasso(n, d, true.coef, true.cp.loc, Sigma, evar)
data <- out[[1]]
beta <- log(n) / 2 # beta here has a different meaning: it is rescaled inside the lasso functions
segd_lasso(data, beta, B = 10, trim = 0.025)$cp
#> [1] 100 300 520 800 901
fastcpd.lasso(
cbind(data[, d + 1], data[, 1:d]),
epsilon = 1e-5,
beta = beta,
cost_adjustment = "BIC",
pruning_coef = 0,
r.progress = FALSE
)@cp_set
#> [1] 100 300 520 800 901
pelt_vanilla_lasso(data, beta, cost = cost_lasso)$cp
#> [1] 100
#> [1] 200
#> [1] 300
#> [1] 400
#> [1] 500
#> [1] 600
#> [1] 700
#> [1] 800
#> [1] 900
#> [1] 1000
#> [1] 0 103 299 510 800 900
fastcpd.lasso(
cbind(data[, d + 1], data[, 1:d]),
vanilla_percentage = 1,
epsilon = 1e-5,
beta = beta,
cost_adjustment = "BIC",
pruning_coef = 0,
r.progress = FALSE
)@cp_set
#> [1] 103 299 510 800 900