The hardware and bandwidth for this mirror are donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.
In cheapr, ‘cheap’ means fast and memory-efficient, and that’s exactly the philosophy that cheapr aims to follow.
You can install the development version of cheapr like so:
num_na()
is a useful function to efficiently return the number of NA
values and can be used in a variety of problems.
Here is an example of a minor optimisation we can add to vctrs::vec_fill_missing
to return x if x has zero or only NA
values.
library(cheapr)
library(vctrs)
#> Warning: package 'vctrs' was built under R version 4.3.2
library(bench)
na_locf <- function(x) {
  # Fill missing values downwards, returning x untouched when there is
  # nothing to fill (no NA values) or nothing to fill from (only NA values).
  # num_na() is recursive, so its result is compared with the unlisted
  # length of x rather than length(x).
  n_missing <- num_na(x)
  skip_fill <- n_missing %in% c(0, unlisted_length(x))
  if (skip_fill) {
    return(x)
  }
  vec_fill_missing(x, direction = "down")
}
x <- rep(NA, 10^6)
identical(x, na_locf(x))
#> [1] TRUE
mark(na_locf(x), vec_fill_missing(x, direction = "down"))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 "na_locf(x)" 913µs 918.8µs 1081. 0B 0
#> 2 "vec_fill_missing(x, direction… 2.69ms 2.92ms 329. 11.4MB 111.
mark(na_locf(x), vec_fill_missing(x, direction = "down"))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 "na_locf(x)" 913µs 919.6µs 1080. 0B 0
#> 2 "vec_fill_missing(x, direction … 2.58ms 2.8ms 342. 11.4MB 219.
All the NA
handling functions in cheapr can make use of multiple cores on your machine using OpenMP.
# 1 core by default
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 910µs 920.9µs 1081. 0B 0
#> 2 sum(is.na(x)) 980µs 1.09ms 880. 3.81MB 81.5
# 4 cores
options(cheapr.cores = 4)
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 257µs 269.7µs 3490. 0B 0
#> 2 sum(is.na(x)) 904µs 1.06ms 906. 3.81MB 77.6
m <- matrix(x, ncol = 10^3)
# Number of NA values by row
mark(row_na_counts(m),
rowSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 row_na_counts(m) 1.16ms 3.32ms 318. 12.88KB 0
#> 2 rowSums(is.na(m)) 2.77ms 2.89ms 344. 3.82MB 33.5
# Number of NA values by col
mark(col_na_counts(m),
colSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 col_na_counts(m) 683µs 717.9µs 1289. 12.88KB 0
#> 2 colSums(is.na(m)) 2ms 2.07ms 479. 3.82MB 44.3
is_na
is a multi-threaded alternative to is.na
x <- rnorm(10^6)
x[sample.int(10^6, 10^5)] <- NA
mark(is.na(x), is_na(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(x) 1.07ms 1.1ms 841. 3.81MB 133.
#> 2 is_na(x) 524.1µs 563.4µs 1673. 3.82MB 199.
### posixlt method is much faster
hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600),
tz = "UTC")
hours[sample.int(10^6, 10^5)] <- NA
mark(is.na(hours), is_na(hours))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(hours) 1.17s 1.17s 0.855 61.05MB 0.855
#> 2 is_na(hours) 5.04ms 5.28ms 180. 9.85MB 9.91
is_na differs from is.na in 2 regards:
- List elements are regarded as NA when either that element is an NA value or it is a list containing only NA values.
- For data frames, is_na returns a logical vector where TRUE defines an empty row of only NA values.
# List example
is.na(list(NA, list(NA, NA), 10))
#> [1] TRUE FALSE FALSE
is_na(list(NA, list(NA, NA), 10))
#> [1] TRUE TRUE FALSE
# Data frame example
df <- data.frame(x = c(1, NA, 3),
y = c(NA, NA, NA))
df
#> x y
#> 1 1 NA
#> 2 NA NA
#> 3 3 NA
is_na(df)
#> [1] FALSE TRUE FALSE
is_na(df)
#> [1] FALSE TRUE FALSE
# The below identity should hold
identical(is_na(df), row_na_counts(df) == ncol(df))
#> [1] TRUE
is_na
and all the NA
handling functions fall back on calling is.na()
if no suitable method is found. This means that custom objects like vctrs rcrds and more are supported.
overview
Inspired by the excellent skimr package, overview()
is a cheaper alternative designed for larger data.
df <- data.frame(
x = sample.int(100, 10^7, TRUE),
y = factor_(sample(LETTERS, 10^7, TRUE)),
z = rnorm(10^7)
)
overview(df, hist = TRUE)
#> obs: 10000000
#> cols: 3
#>
#> ----- Numeric -----
#> col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100
#> 1 x integer 0 1 100 50.49 1 26 50 75 100
#> 2 z numeric 0 1 10000000 0 -5.06 -0.67 0 0.67 5.08
#> iqr sd hist
#> 1 49 28.86 ▇▇▇▇▇
#> 2 1.35 1 ▁▂▇▂▁
#>
#> ----- Categorical -----
#> col class n_missing p_complete n_unique n_levels min max
#> 1 y factor 0 1 26 26 A Z
mark(overview(df))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 overview(df) 972ms 972ms 1.03 76.3MB 1.03
sset
sset(iris, 1:5)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
sset(iris, 1:5, j = "Species")
#> Species
#> 1 setosa
#> 2 setosa
#> 3 setosa
#> 4 setosa
#> 5 setosa
# sset always returns a data frame when input is a data frame
sset(iris, 1, 1) # data frame
#> Sepal.Length
#> 1 5.1
iris[1, 1] # not a data frame
#> [1] 5.1
x <- sample.int(10^6, 10^4, TRUE)
y <- sample.int(10^6, 10^4, TRUE)
mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y])
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(x, x %in_% y) 95.1µs 115µs 8361. 88.3KB 2.08
#> 2 sset(x, x %in% y) 178.7µs 227µs 4340. 285.5KB 6.71
#> 3 x[x %in% y] 136.5µs 201µs 4862. 324.6KB 7.73
sset
uses an internal range-based subset when i
is an ALTREP integer sequence of the form m:n.
mark(sset(df, 0:10^5), df[0:10^5, , drop = FALSE])
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, 0:10^5) 311.3µs 392.6µs 2409. 1.53MB 17.7
#> 2 df[0:10^5, , drop = FALSE] 6.59ms 7.04ms 141. 4.83MB 2.08
It also accepts negative indexes
mark(sset(df, -10^4:0),
df[-10^4:0, , drop = FALSE],
check = FALSE) # The only difference is the row names
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, -10^4:0) 20.6ms 28.2ms 28.7 152MB 19.2
#> 2 df[-10^4:0, , drop = FALSE] 554.9ms 554.9ms 1.80 776MB 7.21
The biggest difference between sset and [ is the way logical vectors are handled. The two main differences when i is a logical vector are:
- NA values are ignored; only the locations of TRUE values are used.
- i must be the same length as x and is not recycled.
# Examples with NAs
x <- c(1, 5, NA, NA, -5)
x[x > 0]
#> [1] 1 5 NA NA
sset(x, x > 0)
#> [1] 1 5
# Example with length(i) < length(x)
sset(x, TRUE)
#> Error in check_length(i, length(x)): i must have length 5
# This is equivalent
x[TRUE]
#> [1] 1 5 NA NA -5
# to..
sset(x)
#> [1] 1 5 NA NA -5
lag_()
lag_(1:10, 3) # Lag(3)
#> [1] NA NA NA 1 2 3 4 5 6 7
lag_(1:10, -3) # Lead(3)
#> [1] 4 5 6 7 8 9 10 NA NA NA
# Using an example from data.table
DF <- data.frame(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])
# Similar to data.table::shift()
lag_(DF, 1) # Lag
#> year v1 v2 v3
#> 1 NA NA NA <NA>
#> 2 2010 0.2463277 1 a
#> 3 2011 0.1108038 2 b
#> 4 2012 0.2035633 3 c
#> 5 2013 0.6307771 4 d
lag_(DF, -1) # Lead
#> year v1 v2 v3
#> 1 2011 0.1108038 2 b
#> 2 2012 0.2035633 3 c
#> 3 2013 0.6307771 4 d
#> 4 2014 0.1343190 5 e
#> 5 NA NA NA <NA>
With lag_
we can update variables by reference, including entire data frames
# At the moment, shift() cannot do this
lag_(DF, set = TRUE)
#> year v1 v2 v3
#> 1 NA NA NA <NA>
#> 2 2010 0.2463277 1 a
#> 3 2011 0.1108038 2 b
#> 4 2012 0.2035633 3 c
#> 5 2013 0.6307771 4 d
DF # Was updated by reference
#> year v1 v2 v3
#> 1 NA NA NA <NA>
#> 2 2010 0.2463277 1 a
#> 3 2011 0.1108038 2 b
#> 4 2012 0.2035633 3 c
#> 5 2013 0.6307771 4 d
gcd2(5, 25)
#> [1] 5
scm2(5, 6)
#> [1] 30
gcd(seq(5, 25, by = 5))
#> [1] 5
scm(seq(5, 25, by = 5))
#> [1] 300
x <- seq(1L, 1000000L, 1L)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 1.2µs 1.3µs 650363. 0B 65.0
x <- seq(0, 10^6, 0.5)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 52ms 52ms 19.2 0B 0
As an example, to create 3 sequences with different increments,
the usual approach might be to use lapply to loop through the increment values together with seq()
# Base R
increments <- c(1, 0.5, 0.1)
start <- 1
end <- 5
unlist(lapply(increments, \(x) seq(start, end, x)))
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
In cheapr you can use seq_()
which accepts vector arguments.
seq_(start, end, increments)
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
Use add_id = TRUE
to label the individual sequences.
seq_(start, end, increments, add_id = TRUE)
#> 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
#> 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4 1.5
#> 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5
#> 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
If you know the sizes of your sequences beforehand, use sequence_()
seq_sizes <- c(3, 5, 10)
sequence_(seq_sizes, from = 0, by = 1/3, add_id = TRUE) |>
enframe_()
#> # A tibble: 18 × 2
#> name value
#> <chr> <dbl>
#> 1 1 0
#> 2 1 0.333
#> 3 1 0.667
#> 4 2 0
#> 5 2 0.333
#> 6 2 0.667
#> 7 2 1
#> 8 2 1.33
#> 9 3 0
#> 10 3 0.333
#> 11 3 0.667
#> 12 3 1
#> 13 3 1.33
#> 14 3 1.67
#> 15 3 2
#> 16 3 2.33
#> 17 3 2.67
#> 18 3 3
You can also calculate the sequence sizes using seq_size()
# which()
x <- rep(TRUE, 10^6)
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 2.48ms 2.67ms 360. 3.81MB 2.07
#> 2 base_which 1.11ms 1.2ms 798. 7.63MB 15.0
x <- rep(FALSE, 10^6)
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 215µs 227µs 4221. 0B 0
#> 2 base_which 456µs 459µs 2134. 3.81MB 16.8
x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 1.44ms 1.57ms 620. 1.91MB 4.19
#> 2 base_which 1.02ms 1.06ms 900. 7.63MB 13.9
x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 3.4ms 3.5ms 280. 3.81MB 2.07
#> 2 base_which 1.38ms 1.44ms 660. 9.54MB 14.5
x <- sample(c(TRUE, FALSE), 10^6, TRUE)
x[sample.int(10^6, 10^4)] <- NA
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 2.09ms 2.16ms 459. 1.89MB 2.06
#> 2 base_which 3.32ms 3.36ms 296. 5.7MB 4.15
# factor()
x <- sample(seq(-10^3, 10^3, 0.01))
y <- do.call(paste0, expand.grid(letters, letters, letters, letters))
mark(cheapr_factor = factor_(x),
base_factor = factor(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 10.1ms 10.6ms 92.7 4.59MB 0
#> 2 base_factor 484ms 485.2ms 2.06 27.84MB 0
mark(cheapr_factor = factor_(x, order = FALSE),
base_factor = factor(x, levels = unique(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 5.37ms 5.95ms 165. 1.53MB 0
#> 2 base_factor 751.12ms 751.12ms 1.33 22.79MB 0
mark(cheapr_factor = factor_(y),
base_factor = factor(y))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 191.92ms 193.07ms 5.18 5.23MB 0
#> 2 base_factor 2.99s 2.99s 0.335 54.35MB 0.335
mark(cheapr_factor = factor_(y, order = FALSE),
base_factor = factor(y, levels = unique(y)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 6.89ms 7.96ms 126. 3.49MB 0
#> 2 base_factor 44.43ms 47.66ms 19.9 39.89MB 2.21
# intersect() & setdiff()
x <- sample.int(10^6, 10^5, TRUE)
y <- sample.int(10^6, 10^5, TRUE)
mark(cheapr_intersect = intersect_(x, y, dups = FALSE),
base_intersect = intersect(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_intersect 3.2ms 3.42ms 290. 1.18MB 0
#> 2 base_intersect 4.44ms 4.65ms 211. 5.16MB 2.20
mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE),
base_setdiff = setdiff(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_setdiff 3.31ms 3.65ms 275. 1.76MB 0
#> 2 base_setdiff 4.71ms 4.95ms 198. 5.71MB 2.22
%in_%
and %!in_%
mark(cheapr = x %in_% y,
base = x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.9ms 2.01ms 485. 781.34KB 0
#> 2 base 2.68ms 2.82ms 350. 2.53MB 2.21
mark(cheapr = x %!in_% y,
base = !x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.93ms 2.04ms 488. 787.84KB 0
#> 2 base 2.82ms 3.03ms 320. 2.91MB 2.21
# cut.default()
x <- rnorm(10^7)
b <- seq(0, max(x), 0.2)
mark(cheapr_cut = cut_numeric(x, b),
base_cut = cut(x, b))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_cut 130ms 130ms 7.67 38.1MB 0
#> 2 base_cut 444ms 445ms 2.25 267.1MB 2.25
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.