The hardware and bandwidth for this mirror is donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.
The vcfppR package implements various useful functions for rapidly manipulating VCF/BCF files in R using the C++ API of vcfpp.h.
## install.package("vcfppR") ## from CRAN
remotes::install_github("Zilong-Li/vcfppR") ## from latest github
If you find it useful, please cite the paper
vcftable
: read VCF as tabular datavcftable
gives you fine control over what you want to extract from VCF/BCF files.
Read only SNP variants
library(vcfppR)
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
res <- vcftable(vcffile, "chr21:1-5100000", vartype = "snps")
str(res)
Read only SNP variants with PL format and drop the INFO column in the VCF/BCF
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz"
res <- vcftable(vcffile, "chr21:1-5100000", vartype = "snps", format = "PL", info = FALSE)
str(res)
Read only INDEL variants with DP format in the VCF/BCF
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz"
res <- vcftable(vcffile, "chr21:1-5100000", vartype = "indels", format = "DP")
str(res)
vcfcomp
: compare two VCF files and report concordanceWant to investigate the concordance between two VCF files? vcfcomp
is the utility function you need!
Genotype correlation
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
res <- vcfcomp(test = vcffile, truth = vcffile, region = "chr21:1-5100000", stats = "r2", formats = c('GT','GT'))
str(res)
Genotype F1 score
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
res <- vcfcomp(test = vcffile, truth = vcffile, region = "chr21:1-5100000", stats = "f1")
str(res)
Genotype Non-Reference Concordance
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
res <- vcfcomp(test = vcffile, truth = vcffile, region = "chr21:1-5100000", stats = "nrc")
str(res)
vcfsummary
: variants characterizationWant to summarize variants discovered by genotype caller e.g. GATK? vcfsummary
is the utility function you need!
Small variants
vcffile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz"
region <- "chr21:10000000-10010000"
res <- vcfsummary(vcffile, region)
str(res)
# get labels and do plottiing
ped <- read.table("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt", h=T)
ped <- ped[order(ped$Superpopulation),]
out <- sapply(unique(ped$Superpopulation), function(pop) {
id <- subset(ped, Superpopulation == pop)[,"SampleID"]
ord <- match(id, res$samples)
res$SNP[ord] + res$INDEL[ord]
})
boxplot(out, main = paste0("Average number of SNP & INDEL variants\nin region ", region))
Complex structure variants
svfile <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz"
sv <- vcfsummary(svfile, svtype = TRUE, region = "chr20")
str(sv)
allsvs <- sv$summary[-1]
bar <- barplot(allsvs, ylim = c(0, 1.1*max(allsvs)),
main = "Variant Counts on chr20 (all SVs)")
There are two classes i.e. vcfreader and vcfwriter offering the full R-bindings of vcfpp.h. Check out the examples in the tests folder or refer to the manual.
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.