MOCHA is intended for use after clustering and cell labeling, since all functions of MOCHA operate on a per-sample basis within each cell population. MOCHA can be used with an ArchR Project as input, as well as with a generic input. This tutorial demonstrates MOCHA with generic inputs in two sections: Importing from Signac and Importing from SnapATAC.
This vignette will follow the Signac tutorial to generate a Signac object and label cell types within it. If you already have a Signac object with cell types labelled, skip to the section Extract Fragments from Signac.
library(Signac)
library(Seurat)
library(SeuratDisk)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
set.seed(1234)
# Download files
# Fetch the 10x multiome PBMC count matrix, the ATAC fragment file and its
# tabix index into the working directory.
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5')
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz')
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi')
# Load in ATAC and RNA data
# `counts` holds the matrices read from the HDF5 file; `fragpath` is the path
# to the ATAC fragment file downloaded above.
counts <- Read10X_h5("pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5")
fragpath <- "pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz"
# Create a Seurat object containing the RNA data
pbmc <- CreateSeuratObject(
  counts = counts$`Gene Expression`,
  assay = "RNA"
)

# Gene annotations passed to the ATAC assay below. They are built here from
# the EnsDb package loaded at the top of the script and switched to UCSC-style
# sequence names ("chr1", ...).
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"

# Create ATAC assay and add it to the object
pbmc[["ATAC"]] <- CreateChromatinAssay(
  counts = counts$Peaks,
  sep = c(":", "-"),
  fragments = fragpath,
  annotation = annotation
)
# Compute per-cell ATAC QC metrics on the ATAC assay.
DefaultAssay(pbmc) <- "ATAC"
pbmc <- NucleosomeSignal(pbmc)
pbmc <- TSSEnrichment(pbmc)

# Keep cells whose ATAC and RNA counts fall inside the allowed ranges and that
# pass the nucleosome-signal and TSS-enrichment thresholds.
pbmc <- subset(
  x = pbmc,
  subset = nCount_ATAC < 100000 &
    nCount_RNA < 25000 &
    nCount_ATAC > 1000 &
    nCount_RNA > 1000 &
    nucleosome_signal < 2 &
    TSS.enrichment > 1
)
# Transform RNA
# Normalize the RNA assay with SCTransform, run PCA on the result, and record
# the percentage of counts from features matching "^MT-" for each cell.
DefaultAssay(pbmc) <- "RNA"
pbmc <- SCTransform(pbmc)
pbmc <- RunPCA(pbmc)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
Label cell types:
# Load PBMC reference
# Download the annotated reference object and read it into `reference`.
system('wget https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat')
reference <- LoadH5Seurat("pbmc_multimodal.h5seurat")
DefaultAssay(pbmc) <- "SCT"

# Transfer cell type labels from reference to query
transfer_anchors <- FindTransferAnchors(
  reference = reference,
  query = pbmc,
  normalization.method = "SCT",
  reference.reduction = "spca",
  recompute.residuals = FALSE,
  dims = 1:50
)

# Predict a `celltype.l2` label for every query cell from the anchors.
predictions <- TransferData(
  anchorset = transfer_anchors,
  refdata = reference$celltype.l2,
  weight.reduction = pbmc[['pca']],
  dims = 1:50
)

# Attach the predictions to the query object's metadata.
pbmc <- AddMetaData(
  object = pbmc,
  metadata = predictions
)
# Use the transferred labels (metadata column "predicted.id") as the active
# cell identities.
Idents(pbmc) <- "predicted.id"

# Fix the display order of the cell types for plotting.
levels(pbmc) <- c(
  "CD4 Naive", "CD4 TCM", "CD4 CTL", "CD4 TEM", "CD4 Proliferating",
  "CD8 Naive", "dnT", "CD8 TEM", "CD8 TCM", "CD8 Proliferating",
  "MAIT", "NK", "NK_CD56bright", "NK Proliferating", "gdT", "Treg",
  "B naive", "B intermediate", "B memory", "Plasmablast",
  "CD14 Mono", "CD16 Mono",
  "cDC1", "cDC2", "pDC",
  "HSPC", "Eryth", "ASDC", "ILC", "Platelet"
)
# Save the labelled object so the steps above do not need to be re-run.
saveRDS(pbmc, 'pbmc_Signac_tutorial.rds')

# Reload the labelled object and collect the Fragment objects attached to its
# ATAC assay.
pbmc <- readRDS('pbmc_Signac_tutorial.rds')
DefaultAssay(pbmc) <- 'ATAC'
fragObj <- Fragments(pbmc)
Signac’s training dataset only has one sample, so to simulate multiple samples, we will duplicate the data here. This should not be necessary with a large dataset.
# Merge three copies of the object; each copy's cell names are prefixed with
# its sample id ('Sample1', 'Sample2', 'Sample3').
full_pbmc <- merge(
  pbmc,
  y = c(pbmc, pbmc),
  add.cell.ids = c('Sample1', 'Sample2', 'Sample3'),
  project = 'DuplicateData'
)
For a larger dataset, you would iterate over the fragObj list and extract each fragments.tsv.gz file.
Instead, we’re just duplicating data for the sake of this tutorial.
More importantly, you do need to modify the barcode in the fragment file so it matches the barcodes in Seurat’s metadata.
# Read the fragment file once per simulated sample and prefix every barcode
# with "Sample<x>_", the same prefix format given to merge() via add.cell.ids.
fragList <- parallel::mclapply(1:3, function(x) {
  frags <- read.table(GetFragmentData(fragObj[[1]]))
  names(frags) <- c('chr', 'start', 'end', 'barcode', 'val')
  frags$barcode <- paste("Sample", x, "_", frags$barcode, sep = '')
  frags
}, mc.cores = 3)
names(fragList) <- c('Sample1', 'Sample2', 'Sample3')
Generate your sample/cell type list by finding all combinations of Samples and Cell Populations
Here we must also rename the GRangesList to have names in the format
CellPopulation#Sample
.
# Every "CellPopulation#Sample" combination of predicted cell types and
# sample names.
celltype_sample_list <- apply(
  expand.grid(unique(pbmc@meta.data$predicted.id), names(fragList)),
  1, paste, collapse = "#"
)

# Extract metadata, add the Sample column, as well as CellBarcode.
fullMeta <- full_pbmc@meta.data
fullMeta$Sample <- gsub("_.*", "", rownames(full_pbmc@meta.data))
fullMeta$CellBarcode <- gsub(".*_", "", rownames(full_pbmc@meta.data))

# Change the CellPopulation_Sample format to CellPopulation#Sample for MOCHA
rownames(fullMeta) <- gsub("_", "#", rownames(fullMeta))

# For each combination, keep the fragments whose barcode belongs to a cell of
# that population and sample, and convert them to a GRanges object.
# NOTE(review): row names of fullMeta now use "#" while fragment barcodes were
# built with "_" ("Sample1_<barcode>"); confirm the %in% match below is not
# empty.
CellType_GRanges <- pbapply::pblapply(cl = 30, celltype_sample_list, function(x) {
  celltype <- gsub("#.*", "", x)
  sample <- gsub(".*#", "", x)
  sortedMeta <- dplyr::filter(fullMeta, predicted.id == celltype, Sample == sample)
  sortedFrags <- dplyr::filter(fragList[[sample]], barcode %in% rownames(sortedMeta))
  makeGRangesFromDataFrame(sortedFrags, keep.extra.columns = TRUE)
})
names(CellType_GRanges) <- celltype_sample_list
Calculate the study signal (the median number of fragments per cell)
# Per sample: count fragments per barcode, restricted to barcodes present in
# the metadata row names. The study signal is the median over all cells.
avg_reads <- lapply(fragList, function(x) {
  filtFrag <- dplyr::filter(as.data.frame(x), barcode %in% rownames(fullMeta))
  as.vector(table(filtFrag$barcode))
})
studySignal <- median(unlist(avg_reads))
# Our blacklist comes included with Signac
blackList <- blacklist_hg38_unified

# Call Open Tiles
# NOTE(review): cellCol is 'barcode' while the metadata column created earlier
# is 'CellBarcode' -- confirm which column MOCHA should read.
tileResults <- MOCHA::callOpenTiles(
  ATACFragments = CellType_GRanges,
  cellColData = fullMeta,
  blackList = blackList,
  genome = 'BSgenome.Hsapiens.UCSC.hg38',
  cellPopLabel = 'predicted.id',
  cellPopulations = fullMeta$predicted.id,
  studySignal = studySignal,
  cellCol = 'barcode',
  TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene",
  Org = "org.Hs.eg.db",
  outDir = paste(getwd(), '/MOCHA_Out', sep = ''),
  numCores = 35
)

# Build the sample-tile matrix from the tile calls.
TSAM <- MOCHA::getSampleTileMatrix(
  tileResults,
  threshold = 0.2,
  numCores = 3,
  verbose = TRUE
)
In this example, we follow the SnapATAC 10X PBMC tutorial through the clustering step before extracting the fragments and cell metadata necessary for MOCHA.
If you have a fully-formed Snap file for your analysis with
clustering results and sample information added to the metadata, skip to
section Formatting Snap File Metadata. If
following the vignette, we STRONGLY recommend installing this patched
version of SnapATAC from this repository with
devtools::install_github("imran-aifi/SnapATAC")
.
Download all the SnapATAC tutorial data (linked above) to your working directory, and load it:
# Shell commands: run these in a terminal, not in the R console.
# Each gdown call is preceded by the Google Drive share link it downloads.
# CLI tool for installing from Google Drive share links
pip install gdown
# https://drive.google.com/file/d/1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE/view?usp=share_link
gdown https://drive.google.com/uc?id=1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE
# https://drive.google.com/file/d/1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5/view?usp=share_link
gdown https://drive.google.com/uc?id=1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5
# https://drive.google.com/file/d/1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd/view?usp=share_link
gdown https://drive.google.com/uc?id=1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd
# https://drive.google.com/file/d/1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6/view?usp=share_link
gdown https://drive.google.com/uc?id=1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6
# https://drive.google.com/file/d/1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379/view?usp=share_link
gdown https://drive.google.com/uc?id=1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379
# https://drive.google.com/file/d/1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp/view?usp=share_link
gdown https://drive.google.com/uc?id=1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp
library(SnapATAC);
# Input files for the two PBMC samples; the three vectors are parallel
# (element i of each describes sample i).
snap.files <- c(
  "atac_pbmc_5k_nextgem.snap",
  "atac_pbmc_10k_nextgem.snap"
)
sample.names <- c(
  "PBMC 5K",
  "PBMC 10K"
)
barcode.files <- c(
  "atac_pbmc_5k_nextgem_singlecell.csv",
  "atac_pbmc_10k_nextgem_singlecell.csv"
)

# One snap object per sample, named by sample.
x.sp.ls <- lapply(seq(snap.files), function(i) {
  createSnap(
    file = snap.files[i],
    sample = sample.names[i]
  )
})
names(x.sp.ls) <- sample.names

# Per-sample barcode tables with two derived QC columns: logUMI and
# promoter_ratio.
barcode.ls <- lapply(seq(snap.files), function(i) {
  barcodes <- read.csv(
    barcode.files[i],
    header = TRUE
  )
  # Drop the first row (presumably a non-cell summary row -- confirm against
  # the CSV).
  barcodes <- barcodes[2:nrow(barcodes), ]
  barcodes$logUMI <- log10(barcodes$passed_filters + 1)
  barcodes$promoter_ratio <- (barcodes$promoter_region_fragments + 1) /
    (barcodes$passed_filters + 1)
  barcodes
})
x.sp.ls
# For both datasets, we identify usable barcodes using [3.5-5] for log10(UMI)
# and [0.4-0.8] for promoter ratio as cutoff.
cutoff.logUMI.low <- c(3.5, 3.5)
cutoff.logUMI.high <- c(5, 5)
cutoff.FRIP.low <- c(0.4, 0.4)
cutoff.FRIP.high <- c(0.8, 0.8)

# Keep only the barcodes of each sample that fall inside both cutoff ranges.
barcode.ls <- lapply(seq(snap.files), function(i) {
  barcodes <- barcode.ls[[i]]
  idx <- which(
    barcodes$logUMI >= cutoff.logUMI.low[i] &
      barcodes$logUMI <= cutoff.logUMI.high[i] &
      barcodes$promoter_ratio >= cutoff.FRIP.low[i] &
      barcodes$promoter_ratio <= cutoff.FRIP.high[i]
  )
  barcodes[idx, ]
})

# Restrict each snap object to the barcodes it shares with the filtered table,
# and store the matching table rows (same order) as its metadata.
x.sp.ls <- lapply(seq(snap.files), function(i) {
  barcodes <- barcode.ls[[i]]
  x.sp <- x.sp.ls[[i]]
  barcode.shared <- intersect(x.sp@barcode, barcodes$barcode)
  x.sp <- x.sp[match(barcode.shared, x.sp@barcode), ]
  barcodes <- barcodes[match(barcode.shared, barcodes$barcode), ]
  x.sp@metaData <- barcodes
  x.sp
})
names(x.sp.ls) <- sample.names
x.sp.ls
# Combine the two snap objects into one and record each cell's sample name in
# the metadata column "Sample".
x.sp <- Reduce(snapRbind, x.sp.ls)
x.sp@metaData["Sample"] <- x.sp@sample
print(table(x.sp@sample))
x.sp
# Step 2. Add cell-by-bin matrix
x.sp <- addBmatToSnap(x.sp, bin.size = 5000)

# Step 3. Matrix binarization
x.sp <- makeBinary(x.sp, mat = "bmat")

# Step 4. Bin filtering
library(GenomicRanges)
# Blacklist regions as GRanges: column 1 = chromosome, columns 2-3 = start/end.
black_list <- read.table("hg19.blacklist.bed.gz")
black_list.gr <- GRanges(
  black_list[, 1],
  IRanges(black_list[, 2], black_list[, 3])
)
# Drop bins that overlap a blacklisted region.
idy <- queryHits(
  findOverlaps(x.sp@feature, black_list.gr)
)
if (length(idy) > 0) {
  x.sp <- x.sp[, -idy, mat = "bmat"]
}
x.sp

# Remove unwanted chromosomes
chr.exclude <- seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))]
# Only filter when something is excluded: an empty chr.exclude would collapse
# to the pattern "", which matches every bin and would remove them all.
if (length(chr.exclude) > 0) {
  idy <- grep(paste(chr.exclude, collapse = "|"), x.sp@feature)
  if (length(idy) > 0) {
    x.sp <- x.sp[, -idy, mat = "bmat"]
  }
}
x.sp
# The coverage of bins roughly obeys a log normal distribution. We remove the
# top 5% bins that overlap with invariant features such as the house keeping
# gene promoters.
bin.cov <- log10(Matrix::colSums(x.sp@bmat) + 1)
hist(
  bin.cov[bin.cov > 0],
  xlab = "log10(bin cov)",
  main = "log10(Bin Cov)",
  col = "lightblue",
  xlim = c(0, 5)
)
# Keep covered bins at or below the 95th percentile of non-zero coverage.
bin.cutoff <- quantile(bin.cov[bin.cov > 0], 0.95)
idy <- which(bin.cov <= bin.cutoff & bin.cov > 0)
x.sp <- x.sp[, idy, mat = "bmat"]
x.sp

# We will further remove any cells of bin coverage less than 1,000. The
# rationale behind this is that some cells may have high number of unique
# fragments but end up with low bin coverage after filtering. This step is
# optional but highly recommended.
idx <- which(Matrix::rowSums(x.sp@bmat) > 1000)
x.sp <- x.sp[idx, ]
x.sp
# Step 5. Dimensionality reduction
# Sample 10000 landmark cells with probability inversely proportional to the
# estimated density of logUMI.
row.covs.dens <- density(
  x = x.sp@metaData[, "logUMI"],
  bw = 'nrd', adjust = 1
)
sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[, "logUMI"])$y + .Machine$double.eps)
set.seed(1)
idx.landmark.ds <- sort(sample(x = seq(nrow(x.sp)), size = 10000, prob = sampling_prob))

# Split into landmark and query cells.
x.landmark.sp <- x.sp[idx.landmark.ds, ]
x.query.sp <- x.sp[-idx.landmark.ds, ]

# Run diffusion maps on the landmark cells, then extend to the query cells.
x.landmark.sp <- runDiffusionMaps(
  obj = x.landmark.sp,
  input.mat = "bmat",
  num.eigs = 50
)
x.landmark.sp@metaData$landmark <- 1
x.query.sp <- runDiffusionMapsExtension(
  obj1 = x.landmark.sp,
  obj2 = x.query.sp,
  input.mat = "bmat"
)
x.query.sp@metaData$landmark <- 0

# Recombine and order cells by sample. The metadata column was created as
# "Sample" (capital S) when the snap objects were combined.
x.sp <- snapRbind(x.landmark.sp, x.query.sp)
x.sp <- x.sp[order(x.sp@metaData[, "Sample"])]

# KNN graph on the first 20 eigen-dimensions, then graph clustering.
x.sp <- runKNN(
  obj = x.sp,
  eigs.dims = 1:20,
  k = 15
)
x.sp <- runCluster(
  obj = x.sp,
  tmp.folder = tempdir(),
  louvain.lib = "R-igraph", # "leiden" preferred, but may cause issues. Requires 'library(leiden)'.
  seed.use = 10,
  resolution = 0.7
)
Add the computed clusters to the Snap object metadata.
The Snap object contains two samples, “PBMC 5K” and “PBMC 10K”. Let’s add a “Sample” column to the metadata. Let’s also add a column “files” pointing to the original .snap files from which each cell came.
# Add clusters (from SnapATAC::runCluster) to metadata
x.sp@metaData$cluster <- x.sp@cluster

# Add Sample name to metadata (if not done previously)
x.sp@metaData$Sample <- x.sp@sample

# Add files to metadata, indicating the original snap file each cell belongs to.
snap.files <- c(
  "atac_pbmc_5k_nextgem.snap",
  "atac_pbmc_10k_nextgem.snap"
)
# Cells from "PBMC 5K" map to the first file, all other cells to the second.
fileList <- ifelse(
  x.sp@metaData$Sample == "PBMC 5K",
  snap.files[[1]],
  snap.files[[2]]
)
x.sp@metaData$files <- fileList

# SAVE this metadata to disk
write.csv(x.sp@metaData, "./snapMetadataforMOCHA.csv")
Now we have a Snap object with metadata containing barcodes, unique
cell ids (column cell_id), sample names, and cell populations (cluster).
We also have our HG19 blackList, black_list.gr
.
Note: Following the tutorial from SnapATAC can often result in a segfault when extracting fragments with
SnapATAC::extractReads
. We recommend running on a machine with large RAM and avoiding parallelization.
Next we extract reads by sample and cell population, ensuring our
final GRanges list is named following the pattern
CellPopulation#Sample
.
# Reload the metadata written earlier and name the columns MOCHA should use.
snapMetadata <- read.csv("./snapMetadataforMOCHA.csv")
cellCol <- "barcode"
cellPopLabel <- "cluster"
cellPopulations <- unique(snapMetadata$cluster)
allSamples <- unique(snapMetadata$Sample)

# Build a flat list with one entry per cell population and sample, named
# "CellPopulation#Sample".
fragmentsGRangesList <- unlist(lapply(allSamples, function(sample) {
  barcodesList <- lapply(cellPopulations, function(cellPop) {
    # Select cells of this cell population within this sample only, so that
    # each entry holds the fragments of a single sample.
    cellPopIdx <- snapMetadata$cluster == cellPop &
      snapMetadata$Sample == sample
    cellPopBarcodes <- snapMetadata[cellPopIdx, ]$barcode

    # Build the file list for the selected cell barcodes
    files <- snapMetadata[cellPopIdx, ]$files

    # Extract fragments
    SnapATAC::extractReads(cellPopBarcodes, files, do.par = FALSE)
  })
  names(barcodesList) <- paste(cellPopulations, sample, sep = "#")
  barcodesList
}))
Calculate the study signal (the median number of fragments per cell)
# Per list entry: count fragments per barcode, restricted to barcodes present
# in the metadata. The study signal is the median over all cells.
avg_reads <- lapply(fragmentsGRangesList, function(x) {
  filtFrag <- dplyr::filter(as.data.frame(x), barcode %in% snapMetadata$barcode)
  as.vector(table(filtFrag$barcode))
})
studySignal <- median(unlist(avg_reads))
# Call Open Tiles
# NOTE(review): black_list.gr was read from "hg19.blacklist.bed.gz" while
# genome/TxDb below are hg38 -- confirm which genome build the snap files use.
tileResults <- MOCHA::callOpenTiles(
  ATACFragments = fragmentsGRangesList,
  cellColData = snapMetadata,
  blackList = black_list.gr,
  genome = "BSgenome.Hsapiens.UCSC.hg38",
  cellPopLabel = cellPopLabel,
  cellPopulations = cellPopulations,
  studySignal = studySignal,
  cellCol = cellCol,
  TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene",
  Org = "org.Hs.eg.db",
  outDir = paste(getwd(), '/MOCHA_Out', sep = ''),
  numCores = 5
)

# Build the sample-tile matrix from the tile calls.
TSAM <- MOCHA::getSampleTileMatrix(
  tileResults,
  threshold = 0.2,
  numCores = 3,
  verbose = TRUE
)