library(SNPkit)
library(snpStats)
#> Loading required package: survival
#> Loading required package: Matrix
library(methods)SNPkit provides a set of S4 tools for reading,
organising, summarising, filtering and exporting single nucleotide
polymorphism (SNP) genotype data. The central data structure is
SNPDataLong, which bundles a genotype matrix
(snpStats::SnpMatrix), a marker map
(data.frame), and metadata about the data source.
This vignette walks through the typical steps:
SNPDataLong object from a toy genotype
matrix.summary().qcSNPs().All file output uses tempdir() so the example does not
write to the user’s home filespace.
SNPDataLong objectWe simulate a tiny dataset with 10 individuals and 10 SNPs.
set.seed(123)
raw_mat <- matrix(
as.raw(sample(1:3, 100, replace = TRUE)),
nrow = 10, ncol = 10
)
rownames(raw_mat) <- paste0("ind", 1:10)
colnames(raw_mat) <- paste0("snp", 1:10)
geno <- new("SnpMatrix", raw_mat)
map <- data.frame(
Name = colnames(geno),
Chromosome = rep(1, 10),
Position = seq_len(10),
stringsAsFactors = FALSE
)
snp_data <- new(
"SNPDataLong",
geno = geno,
map = map,
path = tempfile(),
xref_path = "chip1"
)
snp_data
#> An object of class "SNPDataLong"
#> Slot "geno":
#> A SnpMatrix with 10 rows and 10 columns
#> Row names: ind1 ... ind10
#> Col names: snp1 ... snp10
#>
#> Slot "map":
#> Name Chromosome Position
#> 1 snp1 1 1
#> 2 snp2 1 2
#> 3 snp3 1 3
#> 4 snp4 1 4
#> 5 snp5 1 5
#> 6 snp6 1 6
#> 7 snp7 1 7
#> 8 snp8 1 8
#> 9 snp9 1 9
#> 10 snp10 1 10
#>
#> Slot "path":
#> [1] "/var/folders/gg/p6chdt1j0vsfdxppxd7lkvm00000gn/T//RtmpUDYrQc/filef0ca2f4aab22"
#>
#> Slot "xref_path":
#> [1] "chip1"The summary() method returns a
summary.SNPDataLong object that can be printed for a
human-readable description or queried programmatically.
s <- summary(snp_data)
s$n_individuals
#> [1] 10
s$n_snps
#> [1] 10
s$prop_missing
#> [1] 0
print(s)
#> Summary of SNPDataLong object
#> -----------------------------
#> Individuals : 10
#> SNPs : 10
#>
#> Missing data (NA):
#> - Total : 0 of 100
#> - Proportion: 0 %
#>
#> Distribution of SNPs by chromosome:
#> chr_info_clean
#> 1
#> 10
#>
#> SNPs with missing data by chromosome:
#> 1
#> 0qcSNPs() applies a flexible set of filters. The
action argument controls whether the function returns a
report of removed SNPs ("report"), a filtered
SNPDataLong ("filter"), or both.
filtered <- qcSNPs(
snp_data,
min_snp_cr = 0.8,
min_maf = 0.05,
snp_mono = TRUE,
no_position = TRUE,
action = "filter"
)
#>
#> +===============================+
#> | Quality Control on SNPs |
#> +===============================+
#> Initial number of SNPs: 10
#> Applying quality control filters...
#> - Call rate filter: 0 SNP(s) removed; 10 retained.
#> - MAF filter: 0 SNP(s) removed; 10 retained.
#> - No-position filter: 0 SNP(s) removed; 10 retained.
#> - Monomorphic filter: 0 SNP(s) removed; 10 retained.
filtered
#> An object of class "SNPDataLong"
#> Slot "geno":
#> A SnpMatrix with 10 rows and 10 columns
#> Row names: ind1 ... ind10
#> Col names: snp1 ... snp10
#>
#> Slot "map":
#> Name Chromosome Position
#> 1 snp1 1 1
#> 2 snp2 1 2
#> 3 snp3 1 3
#> 4 snp4 1 4
#> 5 snp5 1 5
#> 6 snp6 1 6
#> 7 snp7 1 7
#> 8 snp8 1 8
#> 9 snp9 1 9
#> 10 snp10 1 10
#>
#> Slot "path":
#> [1] "/var/folders/gg/p6chdt1j0vsfdxppxd7lkvm00000gn/T//RtmpUDYrQc/filef0ca2f4aab22"
#>
#> Slot "xref_path":
#> [1] "chip1"savePlink() and saveFImpute() write files
to a user-supplied directory. For this vignette we use
tempdir().
out_dir <- file.path(tempdir(), "snpkit_demo")
dir.create(out_dir, showWarnings = FALSE)
savePlink(
filtered,
path = out_dir,
name = "demo",
run_plink = FALSE,
chunk_size = 5
)
#>
#> +====================================+
#> | Saving Files in Plink Format |
#> +====================================+
#> Writing .ped file in chunks...
#> Wrote individuals 1 to 5
#> Wrote individuals 6 to 10
#> .ped file written: /var/folders/gg/p6chdt1j0vsfdxppxd7lkvm00000gn/T//RtmpUDYrQc/snpkit_demo/demo.ped
#> Writing .map file...
#> .map file written: /var/folders/gg/p6chdt1j0vsfdxppxd7lkvm00000gn/T//RtmpUDYrQc/snpkit_demo/demo.map
#> Skipping PLINK binary conversion as requested.
#> All done.
list.files(out_dir, pattern = "demo")
#> [1] "demo.map" "demo.ped"See ?qcSNPs, ?savePlink,
?saveFImpute, and ?runAnticlusteringPCA for
details on the individual functions. Functions that wrap external
software (FImpute, PLINK, ADMIXTURE) require the corresponding binary to
be installed on the system.