## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 4
)

## ----echo = FALSE, results = "asis"-------------------------------------------
cat(
  "$$L_o^{\\mathrm{marg}}(\\gamma) + \\lambda\\bigl[",
  "L_g^{\\mathrm{marg}}(\\gamma) - L_p^{\\mathrm{marg}}(\\gamma)\\bigr]$$"
)

## ----simulate-----------------------------------------------------------------
library(mixedsubjectsirt)
library(ggplot2)

set.seed(242424)

n_human    <- 400
n_generated <- 1200
n_items    <- 8

true_pars <- data.frame(
  item = paste0("Item", seq_len(n_items)),
  a    = seq(0.8, 1.6, length.out = n_items),
  d    = seq(-1.1, 1.1, length.out = n_items)
)
true_pars$b <- -true_pars$d / true_pars$a

theta_human <- rnorm(n_human)
observed    <- simulate_2pl(theta_human, true_pars)

# Strongly informative predictions. On the n labeled subjects, predictions
# match human responses (the F = Y benchmark from the simulation study), and
# N additional unlabeled responses are drawn from the same 2PL. This is
# the "good predictor" regime, where the method should lean on unlabeled data.
# (Further down we show the complementary case of an uninformative predictor,
# where the criterion instead drives lambda to 0.)
predicted <- observed
generated <- simulate_2pl(rnorm(n_generated), true_pars)

## ----human-baseline-----------------------------------------------------------
human_start <- fit_2pl(observed, technical = list(NCYCLES = 500))

## ----mml-fit------------------------------------------------------------------
mixed_mml <- fit_mixed_subjects_mml(
  observed  = observed,
  predicted = predicted,
  generated = generated,
  lambda    = 0.5,
  initial_pars = human_start$pars
)

mixed_mml

## ----mml-pars-----------------------------------------------------------------
mixed_mml$item_pars

## ----tune-lambda--------------------------------------------------------------
ability_tuned <- tune_lambda_ability_risk(
  observed     = observed,
  predicted    = predicted,
  generated    = generated,
  target_resp  = observed,
  initial_pars = human_start$pars,
  fit_fn       = fit_mixed_subjects_mml,
  control      = list(maxit = 200)
)

ability_tuned$best_lambda

## ----crossfit-tune------------------------------------------------------------
cf_tuned <- tune_lambda_ability_risk_crossfit(
  observed     = observed,
  predicted    = predicted,
  generated    = generated,
  initial_pars = human_start$pars,
  fit_fn       = fit_mixed_subjects_mml,
  final_fit_fn = fit_mixed_subjects_mml,
  n_splits     = 2,          # the standard PPI sample split (also the default)
  control      = list(maxit = 200)
)

cf_tuned$lambda_by_split   # one tuned lambda per held-out fold
cf_tuned$lambda_final      # fold-size-weighted scalar used for the final fit

## ----covariance---------------------------------------------------------------
Sigma <- vcov(cf_tuned$final_fit)
dim(Sigma)  # 2J × 2J

## ----compare------------------------------------------------------------------
human_only <- fit_mixed_subjects_mml(
  observed  = observed,
  predicted = predicted,
  generated = generated,
  lambda    = 0,
  initial_pars = human_start$pars
)

estimates <- rbind(
  data.frame(estimator = "human only",  human_only$item_pars),
  data.frame(estimator = "MML lambda = 0.5", mixed_mml$item_pars),
  data.frame(estimator = "MML ability-risk",
             ability_tuned$best_fit$item_pars)
)
estimates$true_b <- true_pars$b[match(estimates$item, true_pars$item)]
estimates$true_a <- true_pars$a[match(estimates$item, true_pars$item)]

ggplot(estimates, aes(true_b, b, colour = estimator)) +
  geom_abline(slope = 1, intercept = 0, linewidth = 0.4) +
  geom_point(size = 2) +
  labs(x = "True difficulty", y = "Estimated difficulty", colour = NULL) +
  theme_minimal()

## ----uninformative-predictor--------------------------------------------------
set.seed(242424)
bad_pars    <- true_pars
bad_pars$a  <- pmax(0.05, abs(rnorm(n_items, 0, 0.1)))   # near-zero slopes
bad_pars$d  <- rnorm(n_items, 0, 2)                       # difficulties unrelated to truth

predicted_bad <- simulate_2pl(theta_human, bad_pars)
generated_bad <- simulate_2pl(rnorm(n_generated), bad_pars)

tuned_bad <- tune_lambda_ability_risk(
  observed     = observed,
  predicted    = predicted_bad,
  generated    = generated_bad,
  target_resp  = observed,
  initial_pars = human_start$pars,
  fit_fn       = fit_mixed_subjects_mml,
  control      = list(maxit = 200)
)

tuned_bad$best_lambda   # expect ~0: the useless LLM is correctly ignored

