The hardware and bandwidth for this mirror are donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.

cheapr

R-CMD-check CRAN status Codecov test coverage

In cheapr, ‘cheap’ means fast and memory-efficient, and that’s exactly the philosophy that cheapr aims to follow.

Installation

You can install the development version of cheapr like so:

remotes::install_github("NicChr/cheapr")

Last-observation carried forward (minor optimisation)

num_na() is a useful function to efficiently return the number of NA values and can be used in a variety of problems.

Here is an example of a minor optimisation we can add to vctrs::vec_fill_missing: return x unchanged when it contains either no NA values or only NA values, since in both cases there is nothing to fill.

library(cheapr)
library(vctrs)
#> Warning: package 'vctrs' was built under R version 4.3.2
library(bench)

na_locf <- function(x) {
  # Last-observation-carried-forward with a cheap early exit.
  # num_na() is recursive, so the "everything is NA" case is detected by
  # comparing its result against the unlisted length of x.
  n_missing <- num_na(x)
  nothing_to_fill <- n_missing %in% c(0, unlisted_length(x))
  if (nothing_to_fill) {
    # Either no NA values or only NA values: hand x back untouched
    return(x)
  }
  vec_fill_missing(x, direction = "down")
}
x <- rep(NA, 10^6)
identical(x, na_locf(x))
#> [1] TRUE
mark(na_locf(x), vec_fill_missing(x, direction = "down"))
#> # A tibble: 2 × 6
#>   expression                           min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 "na_locf(x)"                       913µs  918.8µs     1081.        0B       0 
#> 2 "vec_fill_missing(x, direction…   2.69ms   2.92ms      329.    11.4MB     111.
mark(na_locf(x), vec_fill_missing(x, direction = "down"))
#> # A tibble: 2 × 6
#>   expression                            min  median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                       <bch:tm> <bch:t>     <dbl> <bch:byt>    <dbl>
#> 1 "na_locf(x)"                        913µs 919.6µs     1080.        0B       0 
#> 2 "vec_fill_missing(x, direction …   2.58ms   2.8ms      342.    11.4MB     219.

All the NA handling functions in cheapr can make use of multiple cores on your machine using OpenMP.

# 1 core by default
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 num_na(x)        910µs  920.9µs     1081.        0B      0  
#> 2 sum(is.na(x))    980µs   1.09ms      880.    3.81MB     81.5
# 4 cores
options(cheapr.cores = 4)
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 num_na(x)        257µs  269.7µs     3490.        0B      0  
#> 2 sum(is.na(x))    904µs   1.06ms      906.    3.81MB     77.6

Efficient NA counts by row/col

m <- matrix(x, ncol = 10^3)
# Number of NA values by row
mark(row_na_counts(m), 
     rowSums(is.na(m)))
#> # A tibble: 2 × 6
#>   expression             min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>        <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 row_na_counts(m)    1.16ms   3.32ms      318.   12.88KB      0  
#> 2 rowSums(is.na(m))   2.77ms   2.89ms      344.    3.82MB     33.5
# Number of NA values by col
mark(col_na_counts(m), 
     colSums(is.na(m)))
#> # A tibble: 2 × 6
#>   expression             min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>        <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 col_na_counts(m)     683µs  717.9µs     1289.   12.88KB      0  
#> 2 colSums(is.na(m))      2ms   2.07ms      479.    3.82MB     44.3

is_na is a multi-threaded alternative to is.na

x <- rnorm(10^6)
x[sample.int(10^6, 10^5)] <- NA
mark(is.na(x), is_na(x))
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 is.na(x)     1.07ms    1.1ms      841.    3.81MB     133.
#> 2 is_na(x)    524.1µs  563.4µs     1673.    3.82MB     199.

### posixlt method is much faster
hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600),
                    tz = "UTC")
hours[sample.int(10^6, 10^5)] <- NA

mark(is.na(hours), is_na(hours))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 is.na(hours)    1.17s    1.17s     0.855   61.05MB    0.855
#> 2 is_na(hours)   5.04ms   5.28ms   180.       9.85MB    9.91

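It differs from is.na in 2 regards:

- A list element is regarded as NA when it is either an NA value or a list containing only NA values.
- For data frames, is_na returns one logical value per row, which is TRUE when every value in that row is NA.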

# List example
is.na(list(NA, list(NA, NA), 10))
#> [1]  TRUE FALSE FALSE
is_na(list(NA, list(NA, NA), 10))
#> [1]  TRUE  TRUE FALSE

# Data frame example
df <- data.frame(x = c(1, NA, 3),
                 y = c(NA, NA, NA))
df
#>    x  y
#> 1  1 NA
#> 2 NA NA
#> 3  3 NA
is_na(df)
#> [1] FALSE  TRUE FALSE
is_na(df)
#> [1] FALSE  TRUE FALSE
# The below identity should hold
identical(is_na(df), row_na_counts(df) == ncol(df))
#> [1] TRUE

is_na and all the NA handling functions fall back on calling is.na() if no suitable method is found. This means that custom objects like vctrs rcrds and more are supported.

Cheap data frame summaries with overview

Inspired by the excellent skimr package, overview() is a cheaper alternative designed for larger data.

df <- data.frame(
  x = sample.int(100, 10^7, TRUE),
  y = factor_(sample(LETTERS, 10^7, TRUE)),
  z = rnorm(10^7)
)
overview(df, hist = TRUE)
#> obs: 10000000 
#> cols: 3 
#> 
#> ----- Numeric -----
#>   col   class n_missing p_complete n_unique  mean    p0   p25 p50  p75 p100
#> 1   x integer         0          1      100 50.49     1    26  50   75  100
#> 2   z numeric         0          1 10000000     0 -5.06 -0.67   0 0.67 5.08
#>    iqr    sd  hist
#> 1   49 28.86 ▇▇▇▇▇
#> 2 1.35     1 ▁▂▇▂▁
#> 
#> ----- Categorical -----
#>   col  class n_missing p_complete n_unique n_levels min max
#> 1   y factor         0          1       26       26   A   Z
mark(overview(df))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 1 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 overview(df)    972ms    972ms      1.03    76.3MB     1.03

Cheaper and consistent subsetting with sset

sset(iris, 1:5)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
sset(iris, 1:5, j = "Species")
#>   Species
#> 1  setosa
#> 2  setosa
#> 3  setosa
#> 4  setosa
#> 5  setosa

# sset always returns a data frame when input is a data frame

sset(iris, 1, 1) # data frame
#>   Sepal.Length
#> 1          5.1
iris[1, 1] # not a data frame
#> [1] 5.1

x <- sample.int(10^6, 10^4, TRUE)
y <- sample.int(10^6, 10^4, TRUE)
mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y])
#> # A tibble: 3 × 6
#>   expression              min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>         <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 sset(x, x %in_% y)   95.1µs    115µs     8361.    88.3KB     2.08
#> 2 sset(x, x %in% y)   178.7µs    227µs     4340.   285.5KB     6.71
#> 3 x[x %in% y]         136.5µs    201µs     4862.   324.6KB     7.73

sset uses an internal range-based subset when i is an ALTREP integer sequence of the form m:n.

mark(sset(df, 0:10^5), df[0:10^5, , drop = FALSE])
#> # A tibble: 2 × 6
#>   expression                      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                 <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 sset(df, 0:10^5)            311.3µs  392.6µs     2409.    1.53MB    17.7 
#> 2 df[0:10^5, , drop = FALSE]   6.59ms   7.04ms      141.    4.83MB     2.08

It also accepts negative indexes

mark(sset(df, -10^4:0), 
     df[-10^4:0, , drop = FALSE],
     check = FALSE) # The only difference is the row names
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#>   expression                       min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 sset(df, -10^4:0)             20.6ms   28.2ms     28.7      152MB    19.2 
#> 2 df[-10^4:0, , drop = FALSE]  554.9ms  554.9ms      1.80     776MB     7.21

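The biggest difference between sset and [ is the way logical vectors are handled. The two main differences when i is a logical vector are:

- NA values in i are ignored; only the locations of TRUE values are used.
- i must have the same length as x and is not recycled.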

# Examples with NAs
x <- c(1, 5, NA, NA, -5)
x[x > 0]
#> [1]  1  5 NA NA
sset(x, x > 0)
#> [1] 1 5

# Example with length(i) < length(x)
sset(x, TRUE)
#> Error in check_length(i, length(x)): i must have length 5

# This is equivalent 
x[TRUE]
#> [1]  1  5 NA NA -5
# to..
sset(x)
#> [1]  1  5 NA NA -5

Vector and data frame lags with lag_()


lag_(1:10, 3) # Lag(3)
#>  [1] NA NA NA  1  2  3  4  5  6  7
lag_(1:10, -3) # Lead(3)
#>  [1]  4  5  6  7  8  9 10 NA NA NA

# Using an example from data.table

DF <- data.frame(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])

# Similar to data.table::shift()

lag_(DF, 1) # Lag 
#>   year        v1 v2   v3
#> 1   NA        NA NA <NA>
#> 2 2010 0.2463277  1    a
#> 3 2011 0.1108038  2    b
#> 4 2012 0.2035633  3    c
#> 5 2013 0.6307771  4    d
lag_(DF, -1) # Lead
#>   year        v1 v2   v3
#> 1 2011 0.1108038  2    b
#> 2 2012 0.2035633  3    c
#> 3 2013 0.6307771  4    d
#> 4 2014 0.1343190  5    e
#> 5   NA        NA NA <NA>

With lag_ we can update variables by reference, including entire data frames

# At the moment, shift() cannot do this
lag_(DF, set = TRUE)
#>   year        v1 v2   v3
#> 1   NA        NA NA <NA>
#> 2 2010 0.2463277  1    a
#> 3 2011 0.1108038  2    b
#> 4 2012 0.2035633  3    c
#> 5 2013 0.6307771  4    d

DF # Was updated by reference
#>   year        v1 v2   v3
#> 1   NA        NA NA <NA>
#> 2 2010 0.2463277  1    a
#> 3 2011 0.1108038  2    b
#> 4 2012 0.2035633  3    c
#> 5 2013 0.6307771  4    d

Greatest common divisor and smallest common multiple

gcd2(5, 25)
#> [1] 5
scm2(5, 6)
#> [1] 30

gcd(seq(5, 25, by = 5))
#> [1] 5
scm(seq(5, 25, by = 5))
#> [1] 300

x <- seq(1L, 1000000L, 1L)
mark(gcd(x))
#> # A tibble: 1 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 gcd(x)        1.2µs    1.3µs   650363.        0B     65.0
x <- seq(0, 10^6, 0.5)
mark(gcd(x))
#> # A tibble: 1 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 gcd(x)         52ms     52ms      19.2        0B        0

Creating many sequences

As an example, to create 3 sequences with different increments,
the usual approach might be to use lapply to loop through the increment values together with seq():

# Base R
increments <- c(1, 0.5, 0.1)
start <- 1
end <- 5
unlist(lapply(increments, \(x) seq(start, end, x)))
#>  [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0

In cheapr you can use seq_() which accepts vector arguments.

seq_(start, end, increments)
#>  [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0

Use add_id = TRUE to label the individual sequences.

seq_(start, end, increments, add_id = TRUE)
#>   1   1   1   1   1   2   2   2   2   2   2   2   2   2   3   3   3   3   3   3 
#> 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4 1.5 
#>   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
#> 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5 
#>   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
#> 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0

If you know the sizes of your sequences beforehand, use sequence_()

seq_sizes <- c(3, 5, 10)
sequence_(seq_sizes, from = 0, by = 1/3, add_id = TRUE) |> 
  enframe_()
#> # A tibble: 18 × 2
#>    name  value
#>    <chr> <dbl>
#>  1 1     0    
#>  2 1     0.333
#>  3 1     0.667
#>  4 2     0    
#>  5 2     0.333
#>  6 2     0.667
#>  7 2     1    
#>  8 2     1.33 
#>  9 3     0    
#> 10 3     0.333
#> 11 3     0.667
#> 12 3     1    
#> 13 3     1.33 
#> 14 3     1.67 
#> 15 3     2    
#> 16 3     2.33 
#> 17 3     2.67 
#> 18 3     3

You can also calculate the sequence sizes using seq_size()

seq_size(start, end, increments)
#> [1]  5  9 41

‘Cheaper’ Base R alternatives

which

# which()
x <- rep(TRUE, 10^6)
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_which   2.48ms   2.67ms      360.    3.81MB     2.07
#> 2 base_which     1.11ms    1.2ms      798.    7.63MB    15.0
x <- rep(FALSE, 10^6)
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_which    215µs    227µs     4221.        0B      0  
#> 2 base_which      456µs    459µs     2134.    3.81MB     16.8
x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06))
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_which   1.44ms   1.57ms      620.    1.91MB     4.19
#> 2 base_which     1.02ms   1.06ms      900.    7.63MB    13.9
x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06))
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_which    3.4ms    3.5ms      280.    3.81MB     2.07
#> 2 base_which     1.38ms   1.44ms      660.    9.54MB    14.5
x <- sample(c(TRUE, FALSE), 10^6, TRUE)
x[sample.int(10^6, 10^4)] <- NA
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_which   2.09ms   2.16ms      459.    1.89MB     2.06
#> 2 base_which     3.32ms   3.36ms      296.     5.7MB     4.15

factor

# factor()
x <- sample(seq(-10^3, 10^3, 0.01))
y <- do.call(paste0, expand.grid(letters, letters, letters, letters))
mark(cheapr_factor = factor_(x), 
     base_factor = factor(x))
#> # A tibble: 2 × 6
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_factor   10.1ms   10.6ms     92.7     4.59MB        0
#> 2 base_factor      484ms  485.2ms      2.06   27.84MB        0
mark(cheapr_factor = factor_(x, order = FALSE), 
     base_factor = factor(x, levels = unique(x)))
#> # A tibble: 2 × 6
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_factor   5.37ms   5.95ms    165.      1.53MB        0
#> 2 base_factor   751.12ms 751.12ms      1.33   22.79MB        0
mark(cheapr_factor = factor_(y), 
     base_factor = factor(y))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_factor 191.92ms 193.07ms     5.18     5.23MB    0    
#> 2 base_factor      2.99s    2.99s     0.335   54.35MB    0.335
mark(cheapr_factor = factor_(y, order = FALSE), 
     base_factor = factor(y, levels = unique(y)))
#> # A tibble: 2 × 6
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_factor   6.89ms   7.96ms     126.     3.49MB     0   
#> 2 base_factor    44.43ms  47.66ms      19.9   39.89MB     2.21

intersect & setdiff

# intersect() & setdiff()
x <- sample.int(10^6, 10^5, TRUE)
y <- sample.int(10^6, 10^5, TRUE)
mark(cheapr_intersect = intersect_(x, y, dups = FALSE),
     base_intersect = intersect(x, y))
#> # A tibble: 2 × 6
#>   expression            min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_intersect    3.2ms   3.42ms      290.    1.18MB     0   
#> 2 base_intersect     4.44ms   4.65ms      211.    5.16MB     2.20
mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE),
     base_setdiff = setdiff(x, y))
#> # A tibble: 2 × 6
#>   expression          min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_setdiff   3.31ms   3.65ms      275.    1.76MB     0   
#> 2 base_setdiff     4.71ms   4.95ms      198.    5.71MB     2.22

%in_% and %!in_%

mark(cheapr = x %in_% y,
     base = x %in% y)
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr        1.9ms   2.01ms      485.  781.34KB     0   
#> 2 base         2.68ms   2.82ms      350.    2.53MB     2.21
mark(cheapr = x %!in_% y,
     base = !x %in% y)
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr       1.93ms   2.04ms      488.  787.84KB     0   
#> 2 base         2.82ms   3.03ms      320.    2.91MB     2.21

cut.default

# cut.default()
x <- rnorm(10^7)
b <- seq(0, max(x), 0.2)
mark(cheapr_cut = cut_numeric(x, b), 
     base_cut = cut(x, b))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cheapr_cut    130ms    130ms      7.67    38.1MB     0   
#> 2 base_cut      444ms    445ms      2.25   267.1MB     2.25

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.