Introduction

library(cubar)
library(ggplot2)

Common analyses

CDS sequence QC and basic manipulation.

# example data
yeast_cds
#> Loading required package: Biostrings
#> Loading required package: BiocGenerics
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
#>     as.data.frame, basename, cbind, colnames, dirname, do.call,
#>     duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#>     lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#>     pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
#>     tapply, union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:utils':
#> 
#>     findMatches
#> The following objects are masked from 'package:base':
#> 
#>     I, expand.grid, unname
#> Loading required package: IRanges
#> Loading required package: XVector
#> Loading required package: GenomeInfoDb
#> 
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#> 
#>     strsplit
#> DNAStringSet object of length 6600:
#>        width seq                                            names               
#>    [1]   471 ATGAGTTCCCGGTTTGCAAGAA...GATGTGGATATGGATGCGTAA YPL071C
#>    [2]   432 ATGTCTAGATCTGGTGTTGCTG...AGAGGCGCTGGTTCTCATTAA YLL050C
#>    [3]  2160 ATGTCTGGAATGGGTATTGCGA...GAGAGCCTTGCTGGAATATAG YMR172W
#>    [4]   663 ATGTCAGCACCTGCTCAAAACA...GAAGACGATGCTGATTTATAA YOR185C
#>    [5]  2478 ATGGATAACTTCAAAATTTACA...TATCAAAATGGCAGAAAATGA YLL032C
#>    ...   ... ...
#> [6596]  1902 ATGCCAGACAATCTATCATTAC...CACGAAAAGACTTTCATTTAA YBR021W
#> [6597]   138 ATGAGGGTTCTCCATGTTATGC...AAAAAAAAAAAAAAAAGATGA YDR320W-B
#> [6598]   360 ATGTTTATTCTAGCAGAGGTTT...AATGCCGCGCTGGACGATTAA YBR232C
#> [6599]  1704 ATGGCAAGCGAACAGTCCTCAC...TTCCCAAAGAGTTTTAATTGA YDL245C
#> [6600]   906 ATGTTGAATAGTTCAAGAAAAT...TACTCTTTTATCTTCAATTGA YBR024W
yeast_cds_qc <- check_cds(yeast_cds)

# convert a CDS to codon sequence
seq_to_codons(yeast_cds_qc[['YDR320W-B']])
#>  [1] "AGG" "GTT" "CTC" "CAT" "GTT" "ATG" "CTT" "TCT" "TTC" "CTA" "AAC" "TCA"
#> [13] "CTT" "CTT" "TTC" "CTC" "CCT" "ATC" "TGC" "TTT" "TGT" "TTA" "TTA" "CAG"
#> [25] "TTG" "AAG" "GCT" "ACT" "TGT" "GCC" "GTT" "CGT" "GTG" "AAA" "AAA" "TAC"
#> [37] "TCG" "ATG" "AAA" "AAA" "AAA" "AAA" "AAA" "AGA"

# convert a CDS to amino acid sequence
Biostrings::translate(yeast_cds_qc[['YDR320W-B']])
#> 44-letter AAString object
#> seq: RVLHVMLSFLNSLLFLPICFCLLQLKATCAVRVKKYSMKKKKKR

# get codon frequency
yeast_cf <- count_codons(yeast_cds_qc)

Get codon table and visualize

# get codon table for the standard genetic code
ctab <- get_codon_table(gcid = '1')

# plot possible codon and anticodon pairings
plot_ca_pairing(ctab)

Calculate effective number of codons (ENC)

# get enc
enc <- get_enc(yeast_cf)
head(enc)
#>  YPL071C  YLL050C  YMR172W  YOR185C  YLL032C  YBR225W 
#> 53.00343 45.06356 56.01914 50.84984 53.29440 53.82957

# Plot a histogram of per-gene values.
#   x:    values to plot, e.g. the named per-gene vector returned by get_enc();
#         it is reshaped with stack() so the numbers end up in a `values` column
#   xlab: label for the x axis
# Returns the ggplot object.
plot_dist <- function(x, xlab = 'values'){
    long_df <- stack(x)
    hist_plot <- ggplot(long_df, aes(x = values)) + geom_histogram()
    hist_plot + labs(x = xlab, y = 'Number of genes')
}

plot_dist(enc, 'ENC')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate fraction of optimal codons (Fop)

# estimate optimal codons
optimal_codons <- est_optimal_codons(yeast_cds_qc, codon_table = ctab)
optimal_codons
#>     aa_code amino_acid codon subfam anticodon codon_b1 codon_b2 codon_b3
#>  1:       A        Ala   GCT Ala_GC       AGC        G        C        T
#>  2:       A        Ala   GCC Ala_GC       GGC        G        C        C
#>  3:       A        Ala   GCA Ala_GC       TGC        G        C        A
#>  4:       A        Ala   GCG Ala_GC       CGC        G        C        G
#>  5:       R        Arg   AGA Arg_AG       TCT        A        G        A
#>  6:       R        Arg   AGG Arg_AG       CCT        A        G        G
#>  7:       R        Arg   CGT Arg_CG       ACG        C        G        T
#>  8:       R        Arg   CGC Arg_CG       GCG        C        G        C
#>  9:       R        Arg   CGA Arg_CG       TCG        C        G        A
#> 10:       R        Arg   CGG Arg_CG       CCG        C        G        G
#> 11:       N        Asn   AAT Asn_AA       ATT        A        A        T
#> 12:       N        Asn   AAC Asn_AA       GTT        A        A        C
#> 13:       D        Asp   GAT Asp_GA       ATC        G        A        T
#> 14:       D        Asp   GAC Asp_GA       GTC        G        A        C
#> 15:       C        Cys   TGT Cys_TG       ACA        T        G        T
#> 16:       C        Cys   TGC Cys_TG       GCA        T        G        C
#> 17:       Q        Gln   CAA Gln_CA       TTG        C        A        A
#> 18:       Q        Gln   CAG Gln_CA       CTG        C        A        G
#> 19:       E        Glu   GAA Glu_GA       TTC        G        A        A
#> 20:       E        Glu   GAG Glu_GA       CTC        G        A        G
#> 21:       G        Gly   GGT Gly_GG       ACC        G        G        T
#> 22:       G        Gly   GGC Gly_GG       GCC        G        G        C
#> 23:       G        Gly   GGA Gly_GG       TCC        G        G        A
#> 24:       G        Gly   GGG Gly_GG       CCC        G        G        G
#> 25:       H        His   CAT His_CA       ATG        C        A        T
#> 26:       H        His   CAC His_CA       GTG        C        A        C
#> 27:       I        Ile   ATT Ile_AT       AAT        A        T        T
#> 28:       I        Ile   ATC Ile_AT       GAT        A        T        C
#> 29:       I        Ile   ATA Ile_AT       TAT        A        T        A
#> 30:       L        Leu   CTT Leu_CT       AAG        C        T        T
#> 31:       L        Leu   CTC Leu_CT       GAG        C        T        C
#> 32:       L        Leu   CTA Leu_CT       TAG        C        T        A
#> 33:       L        Leu   CTG Leu_CT       CAG        C        T        G
#> 34:       L        Leu   TTA Leu_TT       TAA        T        T        A
#> 35:       L        Leu   TTG Leu_TT       CAA        T        T        G
#> 36:       K        Lys   AAA Lys_AA       TTT        A        A        A
#> 37:       K        Lys   AAG Lys_AA       CTT        A        A        G
#> 38:       M        Met   ATG Met_AT       CAT        A        T        G
#> 39:       F        Phe   TTT Phe_TT       AAA        T        T        T
#> 40:       F        Phe   TTC Phe_TT       GAA        T        T        C
#> 41:       P        Pro   CCT Pro_CC       AGG        C        C        T
#> 42:       P        Pro   CCC Pro_CC       GGG        C        C        C
#> 43:       P        Pro   CCA Pro_CC       TGG        C        C        A
#> 44:       P        Pro   CCG Pro_CC       CGG        C        C        G
#> 45:       S        Ser   AGT Ser_AG       ACT        A        G        T
#> 46:       S        Ser   AGC Ser_AG       GCT        A        G        C
#> 47:       S        Ser   TCT Ser_TC       AGA        T        C        T
#> 48:       S        Ser   TCC Ser_TC       GGA        T        C        C
#> 49:       S        Ser   TCA Ser_TC       TGA        T        C        A
#> 50:       S        Ser   TCG Ser_TC       CGA        T        C        G
#> 51:       T        Thr   ACT Thr_AC       AGT        A        C        T
#> 52:       T        Thr   ACC Thr_AC       GGT        A        C        C
#> 53:       T        Thr   ACA Thr_AC       TGT        A        C        A
#> 54:       T        Thr   ACG Thr_AC       CGT        A        C        G
#> 55:       W        Trp   TGG Trp_TG       CCA        T        G        G
#> 56:       Y        Tyr   TAT Tyr_TA       ATA        T        A        T
#> 57:       Y        Tyr   TAC Tyr_TA       GTA        T        A        C
#> 58:       V        Val   GTT Val_GT       AAC        G        T        T
#> 59:       V        Val   GTC Val_GT       GAC        G        T        C
#> 60:       V        Val   GTA Val_GT       TAC        G        T        A
#> 61:       V        Val   GTG Val_GT       CAC        G        T        G
#>     aa_code amino_acid codon subfam anticodon codon_b1 codon_b2 codon_b3
#>            coef          se     zvalue        pvalue        qvalue
#>  1: -0.08702058 0.001295945 -67.148380  0.000000e+00  0.000000e+00
#>  2: -0.01876569 0.001410424 -13.304998  2.164990e-40  2.401170e-40
#>  3:  0.08612405 0.001651786  52.139941  0.000000e+00  0.000000e+00
#>  4:  0.13245286 0.002897501  45.712791  0.000000e+00  0.000000e+00
#>  5: -0.13023392 0.002699024 -48.252221  0.000000e+00  0.000000e+00
#>  6:  0.13023392 0.002699024  48.252221  0.000000e+00  0.000000e+00
#>  7: -0.21009663 0.004145530 -50.680276  0.000000e+00  0.000000e+00
#>  8:  0.06704651 0.004561998  14.696741  6.763720e-49  7.640498e-49
#>  9:  0.15756282 0.005211268  30.235027 8.209457e-201 1.221407e-200
#> 10:  0.17689904 0.006663392  26.547897 2.715861e-155 3.681501e-155
#> 11:  0.05752254 0.001573677  36.552951 1.600668e-292 2.871787e-292
#> 12: -0.05752254 0.001573677 -36.552951 1.600668e-292 2.871787e-292
#> 13:  0.01846525 0.001472061  12.543809  4.298281e-36  4.599915e-36
#> 14: -0.01846525 0.001472061 -12.543809  4.298281e-36  4.599915e-36
#> 15: -0.10061399 0.003916089 -25.692469 1.418849e-145 1.841485e-145
#> 16:  0.10061399 0.003916089  25.692469 1.418849e-145 1.841485e-145
#> 17: -0.11402126 0.002431540 -46.892615  0.000000e+00  0.000000e+00
#> 18:  0.11402126 0.002431540  46.892615  0.000000e+00  0.000000e+00
#> 19: -0.08495228 0.001741618 -48.777798  0.000000e+00  0.000000e+00
#> 20:  0.08495228 0.001741618  48.777798  0.000000e+00  0.000000e+00
#> 21: -0.17298185 0.001750857 -98.798410  0.000000e+00  0.000000e+00
#> 22:  0.08353350 0.002104163  39.699151  0.000000e+00  0.000000e+00
#> 23:  0.12180433 0.002217761  54.922219  0.000000e+00  0.000000e+00
#> 24:  0.12756390 0.002957835  43.127455  0.000000e+00  0.000000e+00
#> 25:  0.03712113 0.002425445  15.304872  7.094173e-53  8.164992e-53
#> 26: -0.03712113 0.002425445 -15.304872  7.094173e-53  8.164992e-53
#> 27: -0.03925015 0.001369989 -28.649964 1.604694e-180 2.224690e-180
#> 28: -0.05077761 0.001444496 -35.152474 1.065528e-270 1.666596e-270
#> 29:  0.13424317 0.002071677  64.799283  0.000000e+00  0.000000e+00
#> 30: -0.02011263 0.002293289  -8.770210  1.783334e-18  1.843786e-18
#> 31:  0.08046211 0.003663906  21.960744 6.836972e-107 8.511333e-107
#> 32: -0.05325623 0.002230109 -23.880545 4.878789e-126 6.200127e-126
#> 33:  0.04648372 0.002636924  17.628011  1.501568e-69  1.795994e-69
#> 34:  0.05220574 0.001430483  36.495197 1.321618e-291 2.239408e-291
#> 35: -0.05220574 0.001430483 -36.495197 1.321618e-291 2.239408e-291
#> 36:  0.06772648 0.001287660  52.596564  0.000000e+00  0.000000e+00
#> 37: -0.06772648 0.001287660 -52.596564  0.000000e+00  0.000000e+00
#> 38:  0.00000000 0.000000000   0.000000  0.000000e+00  0.000000e+00
#> 39:  0.06277157 0.001697785  36.972620 3.154829e-299 6.207890e-299
#> 40: -0.06277157 0.001697785 -36.972620 3.154829e-299 6.207890e-299
#> 41:  0.01730090 0.001794160   9.642895  5.268072e-22  5.540558e-22
#> 42:  0.11661003 0.003042406  38.328227  0.000000e+00  0.000000e+00
#> 43: -0.11017659 0.001789079 -61.582856  0.000000e+00  0.000000e+00
#> 44:  0.14732013 0.003697757  39.840406  0.000000e+00  0.000000e+00
#> 45: -0.02198467 0.002834011  -7.757442  8.665973e-15  8.665973e-15
#> 46:  0.02198467 0.002834011   7.757442  8.665973e-15  8.665973e-15
#> 47: -0.06420661 0.001368870 -46.904818  0.000000e+00  0.000000e+00
#> 48: -0.03011306 0.001517386 -19.845347  1.208906e-87  1.474865e-87
#> 49:  0.06087806 0.001679651  36.244471 1.214411e-287 2.002138e-287
#> 50:  0.11453925 0.002712104  42.232621  0.000000e+00  0.000000e+00
#> 51: -0.05060232 0.001373491 -36.842131 3.909036e-297 7.451601e-297
#> 52: -0.05078605 0.001495682 -33.955116 1.025107e-252 1.563288e-252
#> 53:  0.05870383 0.001651019  35.556123 6.682231e-277 1.072674e-276
#> 54:  0.11718345 0.002706591  43.295591  0.000000e+00  0.000000e+00
#> 55:  0.00000000 0.000000000   0.000000  0.000000e+00  0.000000e+00
#> 56:  0.05766467 0.001944119  29.661085 2.440435e-193 3.462013e-193
#> 57: -0.05766467 0.001944119 -29.661085 2.440435e-193 3.462013e-193
#> 58: -0.06198191 0.001319203 -46.984348  0.000000e+00  0.000000e+00
#> 59: -0.05476620 0.001444046 -37.925516  0.000000e+00  0.000000e+00
#> 60:  0.10561631 0.002090615  50.519249  0.000000e+00  0.000000e+00
#> 61:  0.09082653 0.002097084  43.310870  0.000000e+00  0.000000e+00
#>            coef          se     zvalue        pvalue        qvalue

# get fop
# use the QC-filtered CDSs (yeast_cds_qc), the same input that was passed to
# count_codons() and est_optimal_codons() earlier, rather than the raw yeast_cds
fop <- get_fop(yeast_cds_qc)
plot_dist(fop, 'Fop')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate Codon Adaptation Index (CAI)

# estimate RSCU of highly expressed genes
# sort yeast_exp by decreasing FPKM and take the top 500 rows
yeast_heg <- head(yeast_exp[order(-yeast_exp$fpkm), ], n = 500)
# keep only genes that have a row in yeast_cf (codon frequencies of the
# QC-passed CDSs), so the row lookup below cannot hit a missing gene
yeast_heg <- yeast_heg[yeast_heg$gene_id %in% rownames(yeast_cf), ]
rscu_heg <- est_rscu(yeast_cf[yeast_heg$gene_id, ], codon_table = ctab)

# calculate CAI of all genes
# note: CAI values are usually calculated based on the RSCU of highly expressed genes.
cai <- get_cai(yeast_cf, rscu = rscu_heg)
plot_dist(cai, xlab = 'CAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate tRNA Adaptation Index (tAI)

# get tRNA gene copy number from GtRNADB
# NOTE: the FASTA file is read directly from the URL, so this step needs
# network access
path_gtrnadb <- 'http://gtrnadb.ucsc.edu/genomes/eukaryota/Scere3/sacCer3-mature-tRNAs.fa'
yeast_trna <- Biostrings::readRNAStringSet(path_gtrnadb)
# keep the part of each sequence name before the first space, split it on '-',
# and tabulate the third field (the anticodon) to count genes per anticodon
trna_gcn <- table(data.table::tstrsplit(sub(' .*', '', names(yeast_trna)), '-')[[3]])
trna_gcn <- trna_gcn[names(trna_gcn) != 'NNN'] # drop the 'NNN' entry; what remains is the gene copy number of each anticodon

# calculate tRNA weight for each codon
trna_w <- est_trna_weight(trna_level = trna_gcn, codon_table = ctab)

# get tAI
tai <- get_tai(yeast_cf, trna_w = trna_w)
plot_dist(tai, 'tAI')
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.