
Getting Started with staRburst

Introduction

staRburst makes it easy to scale parallel R code from your laptop to 100+ AWS workers: you keep your furrr code and simply switch the future plan to future_starburst. This vignette walks through setup and common usage patterns.

Installation

# Install from GitHub
remotes::install_github("yourname/starburst")

One-Time Setup

Before using staRburst, you need to configure AWS resources. This only needs to be done once.

library(starburst)

# Interactive setup wizard (takes ~2 minutes)
starburst_setup()

This will:

- Validate your AWS credentials
- Create an S3 bucket for data transfer
- Create an ECR repository for Docker images
- Set up an ECS cluster and VPC resources
- Check Fargate quotas and offer to request increases
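
If you want to confirm beforehand that R can see your AWS credentials, a quick check like the one below can help. This is a minimal sketch; it assumes credentials come from the standard AWS environment variables or from a shared credentials file, which may not match your setup.

# Minimal pre-flight check (assumes credentials come from the standard AWS
# environment variables or from ~/.aws/credentials)
have_creds <- nzchar(Sys.getenv("AWS_ACCESS_KEY_ID")) ||
  file.exists(path.expand("~/.aws/credentials"))

if (!have_creds) {
  stop("No AWS credentials found; configure them before running starburst_setup()")
}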

Basic Usage

The simplest way to use staRburst is with the furrr package:

library(furrr)
library(starburst)

# Define your work
expensive_simulation <- function(i) {
  # Some computation that takes a few minutes
  results <- replicate(1000, {
    x <- rnorm(10000)
    mean(x^2)
  })
  mean(results)
}

# Local execution (single core)
plan(sequential)
set.seed(42)
system.time({
  results_local <- future_map(1:100, expensive_simulation,
                              .options = furrr_options(seed = TRUE))
})
#> ~16 minutes on typical laptop

# Cloud execution (50 workers)
plan(future_starburst, workers = 50)
set.seed(42)
system.time({
  results_cloud <- future_map(1:100, expensive_simulation,
                              .options = furrr_options(seed = TRUE))
})
#> ~2 minutes (including 45s startup)
#> Cost: ~$0.85

# Same seed, so the two runs give identical results regardless of plan
identical(results_local, results_cloud)
#> [1] TRUE

Example 1: Monte Carlo Simulation

library(starburst)
library(furrr)

# Simulate portfolio returns
simulate_portfolio <- function(seed) {
  set.seed(seed)
  
  # Random walk for 252 trading days
  returns <- rnorm(252, mean = 0.0003, sd = 0.02)
  prices <- cumprod(1 + returns)
  
  list(
    final_value = prices[252],
    max_drawdown = max((cummax(prices) - prices) / cummax(prices)),
    sharpe_ratio = mean(returns) / sd(returns) * sqrt(252)
  )
}

# Run 10,000 simulations on 100 workers
plan(future_starburst, workers = 100)

results <- future_map(1:10000, simulate_portfolio, .options = furrr_options(seed = TRUE))

# Analyze results
final_values <- sapply(results, `[[`, "final_value")
hist(final_values, breaks = 50, main = "Distribution of Portfolio Final Values")

# 95% confidence interval
quantile(final_values, c(0.025, 0.975))
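
The same extraction pattern works for the other fields returned by simulate_portfolio(), for example:

# Summarise drawdowns and Sharpe ratios the same way as final values
max_drawdowns <- sapply(results, `[[`, "max_drawdown")
sharpe_ratios <- sapply(results, `[[`, "sharpe_ratio")

summary(max_drawdowns)
summary(sharpe_ratios)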

Performance:

- Local (single core): ~4 hours
- Cloud (100 workers): ~3 minutes
- Cost: ~$1.80

Example 2: Bootstrap Resampling

library(starburst)
library(furrr)

# Your data
data <- read.csv("my_data.csv")

# Bootstrap function
bootstrap_regression <- function(i, data) {
  # Resample with replacement
  boot_indices <- sample(nrow(data), replace = TRUE)
  boot_data <- data[boot_indices, ]
  
  # Fit model
  model <- lm(y ~ x1 + x2 + x3, data = boot_data)
  
  # Return coefficients
  coef(model)
}

# Run 10,000 bootstrap samples
plan(future_starburst, workers = 50)

boot_results <- future_map(1:10000, bootstrap_regression, data = data,
                           .options = furrr_options(seed = TRUE))

# Convert to matrix
boot_coefs <- do.call(rbind, boot_results)

# 95% confidence intervals for each coefficient
apply(boot_coefs, 2, quantile, probs = c(0.025, 0.975))
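
As a sanity check, the bootstrap intervals can be compared against the usual normal-theory intervals from a single fit on the original data (base R's confint()):

# Parametric intervals from one fit, for comparison with the bootstrap CIs
fit <- lm(y ~ x1 + x2 + x3, data = data)
confint(fit, level = 0.95)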

Example 3: Genomics Pipeline

library(starburst)
library(furrr)

# Process one sample (read_fastq(), align_reads(), and call_variants() are
# placeholders for your own pipeline functions)
process_sample <- function(sample_id) {
  # Read from S3 (data already in cloud)
  fastq_path <- sprintf("s3://my-genomics-data/samples/%s.fastq", sample_id)
  data <- read_fastq(fastq_path)
  
  # Align reads
  aligned <- align_reads(data, reference = "hg38")
  
  # Call variants
  variants <- call_variants(aligned)
  
  # Return summary
  list(
    sample_id = sample_id,
    num_variants = nrow(variants),
    variants = variants
  )
}

# Process 1000 samples on 100 workers
# (list.files() cannot list S3 URIs; one option is aws.s3::get_bucket_df())
objects <- aws.s3::get_bucket_df("my-genomics-data", prefix = "samples/")
sample_ids <- sub("\\.fastq$", "", basename(grep("\\.fastq$", objects$Key, value = TRUE)))

plan(future_starburst, workers = 100)

results <- future_map(sample_ids, process_sample, .progress = TRUE)

# Combine results
all_variants <- do.call(rbind, lapply(results, `[[`, "variants"))

Performance:

- Local (sequential): ~208 hours (8.7 days)
- Cloud (100 workers): ~2 hours
- Cost: ~$47

Working with Data

Data Already in S3

If your data is already in S3, workers can read it directly:

plan(future_starburst, workers = 50)

results <- future_map(file_list, function(file) {
  # Workers read directly from S3
  data <- read.csv(sprintf("s3://my-bucket/%s", file))
  process(data)
})

Uploading Local Data

For smaller datasets, you can pass data as arguments:

# Load data locally
data <- read.csv("local_file.csv")

# staRburst automatically uploads to S3 and distributes
plan(future_starburst, workers = 50)

results <- future_map(1:1000, function(i) {
  # Each worker gets a copy of 'data'
  bootstrap_analysis(data, i)
})

Large Data Optimization

For very large objects, pre-upload to S3:

# Upload once
large_data <- read.csv("huge_file.csv")
s3_path <- starburst_upload(large_data, "s3://my-bucket/large_data.rds")

# Workers read from S3
plan(future_starburst, workers = 100)

results <- future_map(1:1000, function(i) {
  # Read from S3 inside worker
  data <- readRDS(s3_path)
  process(data, i)
})

Cost Management

Estimate Costs

# Check cost before running
plan(future_starburst, workers = 100, cpu = 4, memory = "8GB")
#> Estimated cost: ~$3.50/hour

Set Cost Limits

# Set maximum cost per job
starburst_config(
  max_cost_per_job = 10,      # Don't start jobs that would cost >$10
  cost_alert_threshold = 5     # Warn when approaching $5
)

# Now jobs exceeding limit will error before starting
plan(future_starburst, workers = 1000)  # Would cost ~$35/hour
#> Error: Estimated cost ($35/hr) exceeds limit ($10/hr)

Track Actual Costs

plan(future_starburst, workers = 50)

results <- future_map(data, process)

#> Cluster runtime: 23 minutes
#> Total cost: $1.34

Quota Management

Check Your Quota

starburst_quota_status()
#> Fargate vCPU Quota: 100 / 100 used
#> Allows: ~25 workers with 4 vCPUs each
#>
#> Recommended: Request increase to 500 vCPUs

Request Quota Increase

starburst_request_quota_increase(vcpus = 500)
#> Requesting Fargate vCPU quota increase:
#>   Current: 100 vCPUs
#>   Requested: 500 vCPUs
#>
#> ✓ Quota increase requested (Case ID: 12345678)
#> ✓ AWS typically approves within 1-24 hours

Wave-Based Execution

If you request more workers than your quota allows, staRburst automatically uses wave-based execution:

# Quota allows 25 workers, but you request 100
plan(future_starburst, workers = 100, cpu = 4)

#> ⚠ Requested: 100 workers (400 vCPUs)
#> ⚠ Current quota: 100 vCPUs (allows 25 workers max)
#>
#> 📋 Execution plan:
#>   • Running in 4 waves of 25 workers each
#>
#> 💡 Request quota increase to 500 vCPUs? [y/n]: y
#>
#> ✓ Quota increase requested
#> ⚡ Starting wave 1 (25 workers)...

results <- future_map(1:1000, expensive_function)

#> ⚡ Wave 1: 100% complete (250 tasks)
#> ⚡ Wave 2: 100% complete (500 tasks)
#> ⚡ Wave 3: 100% complete (750 tasks)
#> ⚡ Wave 4: 100% complete (1000 tasks)

Troubleshooting

View Worker Logs

# View logs from most recent cluster
starburst_logs()

# View logs from specific task
starburst_logs(task_id = "abc-123")

# View last 100 log lines
starburst_logs(last_n = 100)

Check Cluster Status

starburst_status()
#> Active Clusters:
#>   • starburst-xyz123: 50 workers running
#>   • starburst-abc456: 25 workers running

Common Issues

Environment mismatch: Packages not found on workers

# Rebuild environment
starburst_rebuild_environment()

Task failures: Some tasks failing

# Check logs
starburst_logs(task_id = "failed-task-id")

# Often due to memory limits - increase worker memory
plan(future_starburst, workers = 50, memory = "16GB")  # Default is 8GB

Slow data transfer: Large objects taking too long

# Use Arrow for data frames
library(arrow)
write_parquet(my_data, "s3://bucket/data.parquet")

# Workers read Arrow
results <- future_map(1:100, function(i) {
  data <- read_parquet("s3://bucket/data.parquet")
  process(data, i)
})
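
If each task only needs a slice of the data, a partitioned Arrow dataset avoids every worker downloading the full file. The sketch below assumes a hypothetical grouping column `group` and a corresponding `groups` vector; adapt both to your data.

# Write once as a dataset partitioned by a grouping column (hypothetical 'group')
write_dataset(my_data, "s3://bucket/data_partitioned", partitioning = "group")

# Each worker reads only its own partition
results <- future_map(groups, function(g) {
  data <- open_dataset("s3://bucket/data_partitioned") |>
    dplyr::filter(group == g) |>
    dplyr::collect()
  process(data, g)
})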

Best Practices

1. Use for Right-Sized Workloads

✅ Good: Each task takes >5 minutes

# 100 tasks, each takes 10 minutes
# Local: 1000 minutes, Cloud: ~10 minutes

❌ Bad: Each task takes <1 minute

# 10000 tasks, each takes 30 seconds
# Startup overhead (45s) dominates
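
A quick back-of-envelope calculation makes the trade-off concrete. The task counts and the ~45 s startup figure come from the comments above; the numbers are purely illustrative.

# Rough wall-clock comparison for the "good" case above
n_tasks      <- 100
task_minutes <- 10
workers      <- 100
startup_min  <- 45 / 60                                          # ~45 s cluster startup

local_minutes <- n_tasks * task_minutes                           # 1000 minutes
cloud_minutes <- startup_min + n_tasks * task_minutes / workers   # ~10.75 minutes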

2. Batch Small Tasks

Instead of:

# 10,000 tiny tasks
results <- future_map(1:10000, small_function)

Do:

# 100 batches of 100 tasks each
batches <- split(1:10000, ceiling(seq_along(1:10000) / 100))

results <- future_map(batches, function(batch) {
  lapply(batch, small_function)
})

# Flatten results
results <- unlist(results, recursive = FALSE)

3. Use S3 for Large Data

Don't:

# big_data is captured by the closure and uploaded for every task
big_data <- read.csv("10GB_file.csv")
results <- future_map(1:1000, function(i) process(big_data, i))

Do:

# Upload once (write.csv() cannot write directly to an S3 URI; use
# starburst_upload() as in "Large Data Optimization" above)
s3_path <- starburst_upload(big_data, "s3://bucket/big_data.rds")

# Workers read from S3
results <- future_map(1:1000, function(i) {
  data <- readRDS(s3_path)
  process(data, i)
})

4. Set Reasonable Limits

starburst_config(
  max_cost_per_job = 50,           # Prevent accidents
  cost_alert_threshold = 25        # Get warned early
)

5. Clean Up

# staRburst auto-cleans, but you can force it
plan(sequential)  # Switch back to local
# Old cluster resources are cleaned up automatically

Advanced: Custom Configuration

CPU and Memory

# High CPU, low memory (CPU-bound work)
plan(future_starburst, workers = 50, cpu = 8, memory = "16GB")

# Low CPU, high memory (memory-bound work)
plan(future_starburst, workers = 25, cpu = 4, memory = "32GB")

Timeout

# Increase timeout for long-running tasks (default 1 hour)
plan(future_starburst, workers = 10, timeout = 7200)  # 2 hours

Region

# Use specific region (default from config)
plan(future_starburst, workers = 50, region = "us-west-2")

Next Steps

Getting Help
