RVenn: An R package for set operations on multiple sets

The hardware and bandwidth for this mirror are donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.

Turgut Yigit Akyol

2019-07-18

Introduction

This tutorial shows how to use RVenn, a package for dealing with multiple sets. The base R functions (intersect, union and setdiff) only work with two sets. The %>% operator from magrittr can be used to chain them, but for many sets this becomes tedious. The reduce function from the purrr package also provides a solution, and it is the function used for set operations in this package. The functions overlap, unite and discern abstract away the details, so one can just construct the universe and choose the sets to operate on by index or set name. Further, a Venn diagram can be drawn for 2-3 sets by using ggvenn. As the name suggests, ggvenn is based on ggplot2, so it is a neat way to show the relationships among a small number of sets. For many sets, it is much better to use UpSet or the setmap function provided within this package. Finally, by using the enrichment_test function, the p-value of an overlap between two sets can be calculated. Here, the usage of all these functions will be shown.

Creating toy data

This chunk of code will create 10 sets with sizes ranging from 5 to 25.

library(purrr)
library(RVenn)
library(ggplot2)

set.seed(42)
toy = map(sample(5:25, replace = TRUE, size = 10),
          function(x) sample(letters, size = x))
toy[1:3]  # First 3 of the sets.
#> [[1]]
#>  [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p"
#> 
#> [[2]]
#>  [1] "a" "u" "z" "e" "t" "m" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d"
#> [18] "j" "n" "f" "r" "v" "c" "k"
#> 
#> [[3]]
#>  [1] "g" "m" "q" "w" "x" "l" "v" "d" "e" "o" "u"

Construct the Venn object

toy = Venn(toy)

Set operations

Intersection

Intersection of all sets:

overlap(toy)
#> [1] "g"

Intersection of selected sets (chosen with set names or indices, respectively):

overlap(toy, c("Set_1", "Set_2", "Set_5", "Set_8"))
#> [1] "t" "o" "d" "v" "e" "g"

overlap(toy, c(1, 2, 5, 8))
#> [1] "t" "o" "d" "v" "e" "g"

Pairwise intersections

overlap_pairs(toy, slice = 1:4)
#> $Set_1...Set_2
#>  [1] "l" "r" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d" "y"
#> [18] "q" "v" "e" "g" "p"
#> 
#> $Set_1...Set_3
#>  [1] "l" "w" "u" "o" "m" "x" "d" "q" "v" "e" "g"
#> 
#> $Set_1...Set_4
#>  [1] "l" "w" "f" "k" "t" "u" "c" "i" "o" "s" "n" "a" "x" "d" "y" "q" "e"
#> [18] "g" "b" "p"
#> 
#> $Set_2...Set_3
#>  [1] "u" "e" "m" "x" "q" "g" "o" "l" "d" "v"
#> 
#> $Set_2...Set_4
#>  [1] "a" "u" "z" "e" "t" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d" "n"
#> [18] "f" "c" "k"
#> 
#> $Set_3...Set_4
#> [1] "g" "q" "w" "x" "l" "d" "e" "o" "u"

Union

Union of all sets:

unite(toy)
#>  [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p" "z" "h"

Union of selected sets (chosen with set names or indices, respectively):

unite(toy, c("Set_3", "Set_8"))
#>  [1] "g" "m" "q" "w" "x" "l" "v" "d" "e" "o" "u" "t"

unite(toy, c(3, 8))
#>  [1] "g" "m" "q" "w" "x" "l" "v" "d" "e" "o" "u" "t"

Pairwise unions

unite_pairs(toy, slice = 1:4)
#> $Set_1...Set_2
#>  [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p" "z" "h"
#> 
#> $Set_1...Set_3
#>  [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p"
#> 
#> $Set_1...Set_4
#>  [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p" "z" "h"
#> 
#> $Set_2...Set_3
#>  [1] "a" "u" "z" "e" "t" "m" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d"
#> [18] "j" "n" "f" "r" "v" "c" "k" "w"
#> 
#> $Set_2...Set_4
#>  [1] "a" "u" "z" "e" "t" "m" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d"
#> [18] "j" "n" "f" "r" "v" "c" "k" "b" "w"
#> 
#> $Set_3...Set_4
#>  [1] "g" "m" "q" "w" "x" "l" "v" "d" "e" "o" "u" "b" "k" "a" "z" "i" "s"
#> [18] "c" "h" "t" "f" "n" "p" "y"

Set difference

discern(toy, 1, 8)
#>  [1] "l" "r" "w" "f" "k" "c" "i" "j" "s" "n" "m" "a" "x" "y" "q" "b" "p"

discern(toy, "Set_1", "Set_8")
#>  [1] "l" "r" "w" "f" "k" "c" "i" "j" "s" "n" "m" "a" "x" "y" "q" "b" "p"

discern(toy, c(3, 4), c(7, 8))
#> [1] "q" "k" "c" "h"

Pairwise differences

discern_pairs(toy, slice = 1:4)
#> $Set_1...Set_2
#> [1] "w" "b"
#> 
#> $Set_1...Set_3
#>  [1] "r" "f" "k" "t" "c" "i" "j" "s" "n" "a" "y" "b" "p"
#> 
#> $Set_1...Set_4
#> [1] "r" "j" "m" "v"
#> 
#> $Set_2...Set_3
#>  [1] "a" "z" "t" "h" "i" "y" "s" "p" "j" "n" "f" "r" "c" "k"
#> 
#> $Set_2...Set_4
#> [1] "m" "j" "r" "v"
#> 
#> $Set_3...Set_4
#> [1] "m" "v"
#> 
#> $Set_2...Set_1
#> [1] "z" "h"
#> 
#> $Set_3...Set_1
#> character(0)
#> 
#> $Set_4...Set_1
#> [1] "z" "h"
#> 
#> $Set_3...Set_2
#> [1] "w"
#> 
#> $Set_4...Set_2
#> [1] "b" "w"
#> 
#> $Set_4...Set_3
#>  [1] "b" "k" "a" "z" "i" "s" "c" "h" "t" "f" "n" "p" "y"

Venn Diagram

For two sets:

ggvenn(toy, slice = c(1, 5))

For three sets:

ggvenn(toy, slice = c(3, 6, 8))

Heatmap

setmap(toy)

Without clustering

setmap(toy, element_clustering = FALSE, set_clustering = FALSE)

Enrichment test

er = enrichment_test(toy, 6, 7)
er$Significance
#> [1] 0.4981

qplot(er$Overlap_Counts, geom = "blank") +
  geom_histogram(fill = "lemonchiffon4", bins = 8, color = "black") +
  geom_vline(xintercept = length(overlap(toy, c(6, 7))), color = "firebrick2",
             size = 2, linetype = "dashed", alpha = 0.7) +
  ggtitle("Null Distribution") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(name = "Overlap Counts") +
  scale_y_continuous(name = "Frequency")

The test above, of course, is not very meaningful as we randomly created the sets; therefore, we get a high p-value. However, when you are working with actual data, e.g. to check if a motif is enriched in the promoter regions of the genes in a gene set, you can use this test. In that case, set1 will be the gene set of interest, set2 will be all the genes in the genome in which the motif is found, and univ will be all the genes of the genome.

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.