The hardware and bandwidth for this mirror are donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.

Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

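Each dataset contains 13 variables, one per column in the output shown below: norm, norm_2, and norm_3 (normal); bern (Bernoulli); neg (presumably negative binomial); pois (Poisson); exp (exponential); unif (uniform); beta; gamma; chi_sq (chi-squared); t_dist (Student's t); and f_dist (F).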

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm   norm_2   norm_3 bern neg pois       exp      unif       beta
#> 1 52.81412 53.51319 53.67912    0   4    1  2.930778 0.6989983 0.10578491
#> 2 68.23793 59.38779 35.13537    1   0    2  0.927297 0.2805017 0.07902199
#> 3 68.65313 58.44543 40.40566    0   0    1 34.820686 0.2691659 0.39371600
#> 4 45.62340 73.49855 61.86956    0   2    5 11.391594 0.6804970 0.46589920
#> 5 43.92391 37.85506 65.47060    0   0    1 21.108217 0.3427744 0.75404320
#> 6 51.66342 46.86237 81.70720    1   1    1  8.234561 0.3648130 0.14351144
#>       gamma    chi_sq     t_dist    f_dist
#> 1 3.3267537  7.212244 -0.8529863 0.5167607
#> 2 3.6669007  9.661967 -1.0643916 0.7294443
#> 3 4.1921663  5.173578  0.5679994 0.9108190
#> 4 0.8220897  8.708574  1.4667282 1.4952712
#> 5 2.0554174 19.556774 -0.6172805 0.1470397
#> 6 2.7243905 11.227294 -3.5974447 1.1166657

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif      beta
#> 1 46.02367 71.25660 39.750130    0   0    2 21.772330 0.9305956 0.4539395
#> 2 49.32905 58.94355 -3.378858    0   2    1 24.235364 0.7742999 0.2503492
#> 3 58.65559 69.64089  6.842902    1   2    2 27.303474 0.7920125 0.2215807
#> 4 29.89703 56.91901 70.248972    0   3    2 53.093483 0.6174459 0.2416861
#> 5 76.22624 59.05343 30.431970    1   3    4  1.065747 0.9747066 0.4487912
#> 6 40.68722 64.01339 12.307218    0   2    1  7.177614 0.7474789 0.4006164
#>       gamma    chi_sq     t_dist    f_dist
#> 1 4.9530141 11.196222 -0.8598328 3.2969429
#> 2 1.4745625  8.721485 -0.2513822 1.1522467
#> 3 0.4714032  7.562053  1.3909277 1.1831806
#> 4 2.3675387 16.463112  0.4024442 1.1443270
#> 5 4.0108882  5.337741  0.8392667 1.1098908
#> 6 4.8259632 11.386632 -0.2674480 0.6722363

Generate a large dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2   norm_3 bern neg pois       exp       unif       beta
#> 1 68.17047 43.77534 32.77409    0   0    2  7.686341 0.07423565 0.22908364
#> 2 54.50925 57.31223 19.59262    0   0    3  5.295738 0.43575854 0.28826722
#> 3 34.23543 73.95832 35.53038    0   1    1 18.730670 0.07914885 0.02256758
#> 4 71.51849 66.28091 40.40233    1   0    8 18.031923 0.90119115 0.36785600
#> 5 54.46610 68.21688 36.06081    0   1    1  8.926137 0.90411665 0.57451053
#> 6 48.68553 60.46151 53.43388    1   1    2  2.153736 0.25749653 0.43182738
#>      gamma    chi_sq     t_dist    f_dist
#> 1 3.527325 14.116512  0.1701468 1.1050098
#> 2 1.410158  4.353722 -0.8447841 2.2619767
#> 3 2.300857 10.397276  2.1414233 1.6591481
#> 4 4.877539 12.756853  0.5519434 1.0611762
#> 5 2.961266 10.642384  0.6644180 0.8904096
#> 6 3.291894 12.879280 -0.3902477 3.0987640

Adding Variation or Ensuring Reproducibility with set.seed()

Call set.seed() before generating random data to control the random number generator: reusing the same seed reproduces exactly the same dataset, while changing the seed produces a different one.

Reproducibility

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853

Variation

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2     norm_3 bern neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694    0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086    0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563    0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269    0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258    1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528    1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.7914120  4.464348 -1.0150596 2.2557295
#> 2 3.0132520  8.062120  0.3262369 1.4955877
#> 3 4.7360954 10.969593  1.5141157 1.0766901
#> 4 5.1235878  6.249247  0.6432708 1.1251542
#> 5 6.6851637  4.358815  0.2025742 0.4754946
#> 6 0.3903841 20.019575  1.6257109 0.6653886

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.