The hardware and bandwidth for this mirror are donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.
This vignette demonstrates using autoFlagR for data
quality auditing in a healthcare context. We’ll work through a complete
example using simulated Electronic Health Records (EHR) data.
# Fix the RNG state so the simulated cohort is reproducible.
set.seed(123)

# Cohort size for the simulated EHR extract.
n_patients <- 500

# Build the cohort. Columns are drawn in the order listed; keep that order,
# because every draw consumes the seeded random stream.
healthcare_data <- data.frame(
  patient_id = seq_len(n_patients),
  age = round(rnorm(n_patients, mean = 55, sd = 15)),
  systolic_bp = round(rnorm(n_patients, mean = 120, sd = 15)),
  diastolic_bp = round(rnorm(n_patients, mean = 80, sd = 10)),
  cholesterol = round(rnorm(n_patients, mean = 200, sd = 40)),
  glucose = round(rnorm(n_patients, mean = 100, sd = 20)),
  bmi = round(rnorm(n_patients, mean = 28, sd = 5), digits = 1),
  gender = sample(c("Male", "Female"), size = n_patients, replace = TRUE),
  diagnosis = sample(
    c("Hypertension", "Diabetes", "Normal"),
    size = n_patients,
    replace = TRUE,
    prob = c(0.3, 0.2, 0.5)
  )
)

# Overwrite the first 25 rows with known anomalies, one variable per group.
# Rows 1-10: impossible ages.
healthcare_data[1:10, "age"] <- c(250, 180, 200, 190, 185, 175, 170, 165, 160, 155)
# Rows 11-15: extreme systolic blood pressure.
healthcare_data[11:15, "systolic_bp"] <- c(300, 280, 290, 275, 285)
# Rows 16-20: very high cholesterol.
healthcare_data[16:20, "cholesterol"] <- c(600, 580, 590, 570, 585)
# Rows 21-25: unrealistically low glucose.
healthcare_data[21:25, "glucose"] <- c(5, 3, 4, 2, 6)

# Ground-truth label: TRUE for exactly the 25 rows modified above.
healthcare_data$is_anomaly_truth <- seq_len(n_patients) <= 25L

head(healthcare_data, n = 6L)
#> patient_id age systolic_bp diastolic_bp cholesterol glucose bmi gender
#> 1 1 250 111 70 167 90 24.6 Male
#> 2 2 180 105 70 188 105 30.9 Female
#> 3 3 200 135 80 164 89 24.5 Female
#> 4 4 190 131 79 225 124 25.3 Female
#> 5 5 185 97 55 245 103 31.9 Female
#> 6 6 175 119 90 285 88 25.6 Female
#> diagnosis is_anomaly_truth
#> 1 Hypertension TRUE
#> 2 Diabetes TRUE
#> 3 Normal TRUE
#> 4 Normal TRUE
#> 5 Hypertension TRUE
#> 6 Normal TRUE
# Score anomalies using Isolation Forest
# Score every record with the "iforest" method. The result is used further
# down through its `anomaly_score` column and `benchmark_metrics` attribute.
scored_data <- score_anomaly(
healthcare_data,
method = "iforest",
# NOTE(review): 0.05 matches the 25 injected rows out of 500 - presumably the
# expected anomaly share; confirm against the function reference.
contamination = 0.05,
# Ground-truth label column created during the simulation step.
ground_truth_col = "is_anomaly_truth",
# NOTE(review): presumably keeps the identifier out of the scoring - confirm.
id_cols = "patient_id"
)
#> Warning in (function (data, sample_size = min(nrow(data), 10000L), ntrees =
#> 500, : Attempting to use more than 1 thread, but package was compiled without
#> OpenMP support. See
#> https://github.com/david-cortes/installing-optimized-libraries#4-macos-install-and-enable-openmp
# Min / quartiles / mean / max of the anomaly scores
summary(scored_data[["anomaly_score"]])
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.0000 0.6605 0.7630 0.7448 0.8583 1.0000
# Plot anomaly score distribution
# Flag the highest-scoring records. `flagged_data` is the object read by the
# plot here and by the top-anomaly listing that follows, so it must exist
# before either runs.
# NOTE(review): assumes autoFlagR and ggplot2 are attached; no library() calls
# are visible in this excerpt.
flagged_data <- flag_top_anomalies(scored_data, contamination = 0.05)

# Histogram of anomaly scores. The dashed red line marks the cut-off stored in
# the "anomaly_threshold" attribute of `flagged_data`.
ggplot(flagged_data, aes(x = anomaly_score)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7, color = "black") +
  geom_vline(
    xintercept = attr(flagged_data, "anomaly_threshold"),
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  labs(
    title = "Distribution of Anomaly Scores",
    x = "Anomaly Score",
    y = "Frequency"
  ) +
  theme_minimal()

# Get top 10 anomalies
top_anomalies <- get_top_anomalies(flagged_data, n = 10)

# Show only the clinical measurements and the scoring columns for review
review_cols <- c(
  "patient_id", "age", "systolic_bp", "cholesterol",
  "glucose", "anomaly_score", "is_anomaly"
)
top_anomalies[, review_cols]
#> patient_id age systolic_bp cholesterol glucose anomaly_score is_anomaly
#> 1 239 60 119 191 104 1.0000000 TRUE
#> 2 171 52 112 179 99 0.9874545 TRUE
#> 3 51 59 133 199 96 0.9837984 TRUE
#> 4 299 55 123 218 87 0.9836844 TRUE
#> 5 59 57 117 158 104 0.9749733 TRUE
#> 6 267 53 139 207 108 0.9699068 TRUE
#> 7 290 58 110 202 113 0.9602628 TRUE
#> 8 277 58 120 222 86 0.9584102 TRUE
#> 9 48 48 130 225 100 0.9574119 TRUE
#> 10 79 58 97 198 90 0.9572089 TRUE
# Extract benchmark metrics
# Print the benchmark figures, but only when the scored data carries a
# "benchmark_metrics" attribute.
# NOTE(review): the recorded output below shows Top-10/Top-50 recall of 0 and
# AUC-PR of about 0.026, and none of the injected rows (patient_id 1-25) appear
# in the top-10 listing - check the score direction before relying on these
# figures.
has_benchmark <- !is.null(attr(scored_data, "benchmark_metrics"))
if (has_benchmark) {
  metrics <- extract_benchmark_metrics(scored_data)
  recall_at_k <- metrics$top_k_recall

  cat("AUC-ROC:", metrics$auc_roc, "\n")
  cat("AUC-PR:", metrics$auc_pr, "\n")
  cat("Top-10 Recall:", recall_at_k$top_10, "\n")
  cat("Top-50 Recall:", recall_at_k$top_50, "\n")
}
#> AUC-ROC: 0.9843368
#> AUC-PR: 0.02557768
#> Top-10 Recall: 0
#> Top-50 Recall: 0
# Generate PDF audit report (saves to tempdir() by default)
# Build the PDF report from the raw data, passing the same scoring settings
# used earlier (method, contamination, ground-truth and id columns). The file
# is written under tempdir().
generate_audit_report(
  healthcare_data,
  filename = "healthcare_audit_report",
  output_dir = tempdir(),
  output_format = "pdf",
  method = "iforest",
  contamination = 0.05,
  ground_truth_col = "is_anomaly_truth",
  id_cols = "patient_id"
)

The report will include:

- Executive summary with key metrics
- Anomaly score distribution
- Prioritized audit listing (heatmap)
- Bivariate visualizations
- Distribution comparisons
- Benchmarking results (if ground truth provided)
This example demonstrated:

1. Creating and preprocessing healthcare data
2. Scoring anomalies using Isolation Forest
3. Flagging top anomalies for review
4. Visualizing results
5. Extracting benchmark metrics
6. Generating professional audit reports
For more details, see the Function Reference.
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.