CDS sequence QC and basic manipulation.
# example data: `yeast_cds`, printed below as a DNAStringSet of 6600 named
# sequences (the package-loading messages come first in the output)
yeast_cds
#> Loading required package: Biostrings
#> Loading required package: BiocGenerics
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
#> as.data.frame, basename, cbind, colnames, dirname, do.call,
#> duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#> lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#> pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
#> tapply, union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:utils':
#>
#> findMatches
#> The following objects are masked from 'package:base':
#>
#> I, expand.grid, unname
#> Loading required package: IRanges
#> Loading required package: XVector
#> Loading required package: GenomeInfoDb
#>
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#>
#> strsplit
#> DNAStringSet object of length 6600:
#> width seq names
#> [1] 471 ATGAGTTCCCGGTTTGCAAGAA...GATGTGGATATGGATGCGTAA YPL071C
#> [2] 432 ATGTCTAGATCTGGTGTTGCTG...AGAGGCGCTGGTTCTCATTAA YLL050C
#> [3] 2160 ATGTCTGGAATGGGTATTGCGA...GAGAGCCTTGCTGGAATATAG YMR172W
#> [4] 663 ATGTCAGCACCTGCTCAAAACA...GAAGACGATGCTGATTTATAA YOR185C
#> [5] 2478 ATGGATAACTTCAAAATTTACA...TATCAAAATGGCAGAAAATGA YLL032C
#> ... ... ...
#> [6596] 1902 ATGCCAGACAATCTATCATTAC...CACGAAAAGACTTTCATTTAA YBR021W
#> [6597] 138 ATGAGGGTTCTCCATGTTATGC...AAAAAAAAAAAAAAAAGATGA YDR320W-B
#> [6598] 360 ATGTTTATTCTAGCAGAGGTTT...AATGCCGCGCTGGACGATTAA YBR232C
#> [6599] 1704 ATGGCAAGCGAACAGTCCTCAC...TTCCCAAAGAGTTTTAATTGA YDL245C
#> [6600] 906 ATGTTGAATAGTTCAAGAAAAT...TACTCTTTTATCTTCAATTGA YBR024W
# quality-check the CDSs; the checked set is what the codon conversion,
# translation and codon counting steps below operate on.
# In the example below YDR320W-B yields 44 codons, whereas the raw listing
# shows 138 nt (46 codons): the leading ATG and the terminal stop codon are
# no longer present after check_cds().
yeast_cds_qc <- check_cds(yeast_cds)
# convert a CDS to codon sequence (a character vector of triplets)
seq_to_codons(yeast_cds_qc[['YDR320W-B']])
#> [1] "AGG" "GTT" "CTC" "CAT" "GTT" "ATG" "CTT" "TCT" "TTC" "CTA" "AAC" "TCA"
#> [13] "CTT" "CTT" "TTC" "CTC" "CCT" "ATC" "TGC" "TTT" "TGT" "TTA" "TTA" "CAG"
#> [25] "TTG" "AAG" "GCT" "ACT" "TGT" "GCC" "GTT" "CGT" "GTG" "AAA" "AAA" "TAC"
#> [37] "TCG" "ATG" "AAA" "AAA" "AAA" "AAA" "AAA" "AGA"
# convert a CDS to amino acid sequence (one residue per codon shown above)
Biostrings::translate(yeast_cds_qc[['YDR320W-B']])
#> 44-letter AAString object
#> seq: RVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR
# get codon frequency for every quality-checked CDS; later steps index
# yeast_cf by gene id through its row names
yeast_cf <- count_codons(yeast_cds_qc)
Get the codon table and visualize codon-anticodon pairings
# get codon table for the standard genetic code
# (ctab is reused below as the `codon_table` argument of
# est_optimal_codons(), est_rscu() and est_trna_weight())
# NOTE(review): gcid = '1' is presumably the NCBI genetic code id - confirm.
ctab <- get_codon_table(gcid = '1')
# plot possible codon and anticodon pairings
plot_ca_pairing(ctab)
Calculate effective number of codons (ENC)
# get enc: effective number of codons for each gene, computed from the
# codon frequencies in yeast_cf
enc <- get_enc(yeast_cf)
# peek at the first values; the result is a numeric vector named by gene id
head(enc)
#> YPL071C YLL050C YMR172W YOR185C YLL032C YBR225W
#> 53.00343 45.06356 56.01914 50.84984 53.29440 53.82957
# Histogram helper for the per-gene metrics computed in this document.
#
# x:    named vector of values; stack() turns it into a data frame whose
#       numbers live in a `values` column.
# xlab: label for the x axis (default 'values').
#
# Returns the ggplot object. geom_histogram() is called without a bin
# setting, which is why ggplot2 prints its `stat_bin()` message when the
# plot is drawn.
plot_dist <- function(x, xlab = 'values') {
  long_df <- stack(x)
  hist_plot <- ggplot(long_df, aes(x = values)) + geom_histogram()
  hist_plot + labs(x = xlab, y = 'Number of genes')
}
# histogram of ENC values across genes
plot_dist(enc, 'ENC')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Calculate fraction of optimal codons (Fop)
# estimate optimal codons from the quality-checked CDSs, using the
# standard-code codon table
optimal_codons <- est_optimal_codons(yeast_cds_qc, codon_table = ctab)
# print the result: one row per codon (61 rows) carrying the codon-table
# annotation plus the coef, se, zvalue, pvalue and qvalue columns
optimal_codons
#> aa_code amino_acid codon subfam anticodon codon_b1 codon_b2 codon_b3
#> 1: A Ala GCT Ala_GC AGC G C T
#> 2: A Ala GCC Ala_GC GGC G C C
#> 3: A Ala GCA Ala_GC TGC G C A
#> 4: A Ala GCG Ala_GC CGC G C G
#> 5: R Arg AGA Arg_AG TCT A G A
#> 6: R Arg AGG Arg_AG CCT A G G
#> 7: R Arg CGT Arg_CG ACG C G T
#> 8: R Arg CGC Arg_CG GCG C G C
#> 9: R Arg CGA Arg_CG TCG C G A
#> 10: R Arg CGG Arg_CG CCG C G G
#> 11: N Asn AAT Asn_AA ATT A A T
#> 12: N Asn AAC Asn_AA GTT A A C
#> 13: D Asp GAT Asp_GA ATC G A T
#> 14: D Asp GAC Asp_GA GTC G A C
#> 15: C Cys TGT Cys_TG ACA T G T
#> 16: C Cys TGC Cys_TG GCA T G C
#> 17: Q Gln CAA Gln_CA TTG C A A
#> 18: Q Gln CAG Gln_CA CTG C A G
#> 19: E Glu GAA Glu_GA TTC G A A
#> 20: E Glu GAG Glu_GA CTC G A G
#> 21: G Gly GGT Gly_GG ACC G G T
#> 22: G Gly GGC Gly_GG GCC G G C
#> 23: G Gly GGA Gly_GG TCC G G A
#> 24: G Gly GGG Gly_GG CCC G G G
#> 25: H His CAT His_CA ATG C A T
#> 26: H His CAC His_CA GTG C A C
#> 27: I Ile ATT Ile_AT AAT A T T
#> 28: I Ile ATC Ile_AT GAT A T C
#> 29: I Ile ATA Ile_AT TAT A T A
#> 30: L Leu CTT Leu_CT AAG C T T
#> 31: L Leu CTC Leu_CT GAG C T C
#> 32: L Leu CTA Leu_CT TAG C T A
#> 33: L Leu CTG Leu_CT CAG C T G
#> 34: L Leu TTA Leu_TT TAA T T A
#> 35: L Leu TTG Leu_TT CAA T T G
#> 36: K Lys AAA Lys_AA TTT A A A
#> 37: K Lys AAG Lys_AA CTT A A G
#> 38: M Met ATG Met_AT CAT A T G
#> 39: F Phe TTT Phe_TT AAA T T T
#> 40: F Phe TTC Phe_TT GAA T T C
#> 41: P Pro CCT Pro_CC AGG C C T
#> 42: P Pro CCC Pro_CC GGG C C C
#> 43: P Pro CCA Pro_CC TGG C C A
#> 44: P Pro CCG Pro_CC CGG C C G
#> 45: S Ser AGT Ser_AG ACT A G T
#> 46: S Ser AGC Ser_AG GCT A G C
#> 47: S Ser TCT Ser_TC AGA T C T
#> 48: S Ser TCC Ser_TC GGA T C C
#> 49: S Ser TCA Ser_TC TGA T C A
#> 50: S Ser TCG Ser_TC CGA T C G
#> 51: T Thr ACT Thr_AC AGT A C T
#> 52: T Thr ACC Thr_AC GGT A C C
#> 53: T Thr ACA Thr_AC TGT A C A
#> 54: T Thr ACG Thr_AC CGT A C G
#> 55: W Trp TGG Trp_TG CCA T G G
#> 56: Y Tyr TAT Tyr_TA ATA T A T
#> 57: Y Tyr TAC Tyr_TA GTA T A C
#> 58: V Val GTT Val_GT AAC G T T
#> 59: V Val GTC Val_GT GAC G T C
#> 60: V Val GTA Val_GT TAC G T A
#> 61: V Val GTG Val_GT CAC G T G
#> aa_code amino_acid codon subfam anticodon codon_b1 codon_b2 codon_b3
#> coef se zvalue pvalue qvalue
#> 1: -0.08702058 0.001295945 -67.148380 0.000000e+00 0.000000e+00
#> 2: -0.01876569 0.001410424 -13.304998 2.164990e-40 2.401170e-40
#> 3: 0.08612405 0.001651786 52.139941 0.000000e+00 0.000000e+00
#> 4: 0.13245286 0.002897501 45.712791 0.000000e+00 0.000000e+00
#> 5: -0.13023392 0.002699024 -48.252221 0.000000e+00 0.000000e+00
#> 6: 0.13023392 0.002699024 48.252221 0.000000e+00 0.000000e+00
#> 7: -0.21009663 0.004145530 -50.680276 0.000000e+00 0.000000e+00
#> 8: 0.06704651 0.004561998 14.696741 6.763720e-49 7.640498e-49
#> 9: 0.15756282 0.005211268 30.235027 8.209457e-201 1.221407e-200
#> 10: 0.17689904 0.006663392 26.547897 2.715861e-155 3.681501e-155
#> 11: 0.05752254 0.001573677 36.552951 1.600668e-292 2.871787e-292
#> 12: -0.05752254 0.001573677 -36.552951 1.600668e-292 2.871787e-292
#> 13: 0.01846525 0.001472061 12.543809 4.298281e-36 4.599915e-36
#> 14: -0.01846525 0.001472061 -12.543809 4.298281e-36 4.599915e-36
#> 15: -0.10061399 0.003916089 -25.692469 1.418849e-145 1.841485e-145
#> 16: 0.10061399 0.003916089 25.692469 1.418849e-145 1.841485e-145
#> 17: -0.11402126 0.002431540 -46.892615 0.000000e+00 0.000000e+00
#> 18: 0.11402126 0.002431540 46.892615 0.000000e+00 0.000000e+00
#> 19: -0.08495228 0.001741618 -48.777798 0.000000e+00 0.000000e+00
#> 20: 0.08495228 0.001741618 48.777798 0.000000e+00 0.000000e+00
#> 21: -0.17298185 0.001750857 -98.798410 0.000000e+00 0.000000e+00
#> 22: 0.08353350 0.002104163 39.699151 0.000000e+00 0.000000e+00
#> 23: 0.12180433 0.002217761 54.922219 0.000000e+00 0.000000e+00
#> 24: 0.12756390 0.002957835 43.127455 0.000000e+00 0.000000e+00
#> 25: 0.03712113 0.002425445 15.304872 7.094173e-53 8.164992e-53
#> 26: -0.03712113 0.002425445 -15.304872 7.094173e-53 8.164992e-53
#> 27: -0.03925015 0.001369989 -28.649964 1.604694e-180 2.224690e-180
#> 28: -0.05077761 0.001444496 -35.152474 1.065528e-270 1.666596e-270
#> 29: 0.13424317 0.002071677 64.799283 0.000000e+00 0.000000e+00
#> 30: -0.02011263 0.002293289 -8.770210 1.783334e-18 1.843786e-18
#> 31: 0.08046211 0.003663906 21.960744 6.836972e-107 8.511333e-107
#> 32: -0.05325623 0.002230109 -23.880545 4.878789e-126 6.200127e-126
#> 33: 0.04648372 0.002636924 17.628011 1.501568e-69 1.795994e-69
#> 34: 0.05220574 0.001430483 36.495197 1.321618e-291 2.239408e-291
#> 35: -0.05220574 0.001430483 -36.495197 1.321618e-291 2.239408e-291
#> 36: 0.06772648 0.001287660 52.596564 0.000000e+00 0.000000e+00
#> 37: -0.06772648 0.001287660 -52.596564 0.000000e+00 0.000000e+00
#> 38: 0.00000000 0.000000000 0.000000 0.000000e+00 0.000000e+00
#> 39: 0.06277157 0.001697785 36.972620 3.154829e-299 6.207890e-299
#> 40: -0.06277157 0.001697785 -36.972620 3.154829e-299 6.207890e-299
#> 41: 0.01730090 0.001794160 9.642895 5.268072e-22 5.540558e-22
#> 42: 0.11661003 0.003042406 38.328227 0.000000e+00 0.000000e+00
#> 43: -0.11017659 0.001789079 -61.582856 0.000000e+00 0.000000e+00
#> 44: 0.14732013 0.003697757 39.840406 0.000000e+00 0.000000e+00
#> 45: -0.02198467 0.002834011 -7.757442 8.665973e-15 8.665973e-15
#> 46: 0.02198467 0.002834011 7.757442 8.665973e-15 8.665973e-15
#> 47: -0.06420661 0.001368870 -46.904818 0.000000e+00 0.000000e+00
#> 48: -0.03011306 0.001517386 -19.845347 1.208906e-87 1.474865e-87
#> 49: 0.06087806 0.001679651 36.244471 1.214411e-287 2.002138e-287
#> 50: 0.11453925 0.002712104 42.232621 0.000000e+00 0.000000e+00
#> 51: -0.05060232 0.001373491 -36.842131 3.909036e-297 7.451601e-297
#> 52: -0.05078605 0.001495682 -33.955116 1.025107e-252 1.563288e-252
#> 53: 0.05870383 0.001651019 35.556123 6.682231e-277 1.072674e-276
#> 54: 0.11718345 0.002706591 43.295591 0.000000e+00 0.000000e+00
#> 55: 0.00000000 0.000000000 0.000000 0.000000e+00 0.000000e+00
#> 56: 0.05766467 0.001944119 29.661085 2.440435e-193 3.462013e-193
#> 57: -0.05766467 0.001944119 -29.661085 2.440435e-193 3.462013e-193
#> 58: -0.06198191 0.001319203 -46.984348 0.000000e+00 0.000000e+00
#> 59: -0.05476620 0.001444046 -37.925516 0.000000e+00 0.000000e+00
#> 60: 0.10561631 0.002090615 50.519249 0.000000e+00 0.000000e+00
#> 61: 0.09082653 0.002097084 43.310870 0.000000e+00 0.000000e+00
#> coef se zvalue pvalue qvalue
# get fop (fraction of optimal codons) for each gene.
# Use the quality-checked sequences - the same input given to
# est_optimal_codons() and count_codons() - so that Fop is computed on the
# same gene set and codons as the other metrics rather than on the raw CDSs.
fop <- get_fop(yeast_cds_qc)
# histogram of Fop values across genes
plot_dist(fop, 'Fop')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Calculate Codon Adaptation Index (CAI)
# estimate RSCU of highly expressed genes
# take the 500 rows of yeast_exp with the largest fpkm (sorted high to low)
yeast_heg <- head(yeast_exp[order(-yeast_exp$fpkm), ], n = 500)
# keep only the genes that have a row in the codon frequency table
yeast_heg <- yeast_heg[yeast_heg$gene_id %in% rownames(yeast_cf), ]
# RSCU is estimated from the codon frequencies of those genes only
rscu_heg <- est_rscu(yeast_cf[yeast_heg$gene_id, ], codon_table = ctab)
# calculate CAI of all genes
# note: CAI values are usually calculated based on the RSCU of highly
# expressed genes, which is why rscu_heg is passed here.
cai <- get_cai(yeast_cf, rscu = rscu_heg)
# histogram of CAI values across genes
plot_dist(cai, xlab = 'CAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Calculate tRNA Adaptation Index (tAI)
# get tRNA gene copy number from GtRNADB
# (the FASTA is read straight from the URL, so this step needs network access)
path_gtrnadb <- 'http://gtrnadb.ucsc.edu/genomes/eukaryota/Scere3/sacCer3-mature-tRNAs.fa'
yeast_trna <- Biostrings::readRNAStringSet(path_gtrnadb)
# parse each sequence name: drop everything from the first space onwards,
# split the remainder on '-' and keep the third field, then tabulate how
# often each value occurs.
# NOTE(review): presumably the third field is the anticodon - the header
# layout is not shown here; confirm against the FASTA file.
trna_gcn <- table(data.table::tstrsplit(sub(' .*', '', names(yeast_trna)), '-')[[3]])
trna_gcn <- trna_gcn[names(trna_gcn) != 'NNN'] # copy number of each anticodon; 'NNN' entries are dropped
# calculate tRNA weight for each codon from the anticodon copy numbers
trna_w <- est_trna_weight(trna_level = trna_gcn, codon_table = ctab)
# get tAI for every gene in the codon frequency table
tai <- get_tai(yeast_cf, trna_w = trna_w)
# histogram of tAI values across genes
plot_dist(tai, 'tAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.