Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

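Each dataset contains one column per simulated variable, as shown in the output below: norm, norm_2, norm_3, binom, neg, pois, exp, unif, beta, gamma, chi_sq, t_dist, and f_dist.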

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm   norm_2   norm_3 binom neg pois        exp      unif      beta
#> 1 52.69034 49.96545 53.35655     1   0    3  2.0434317 0.1129147 0.1540612
#> 2 81.28921 49.37707 45.78710     1   0    2 20.4089309 0.1427738 0.4809002
#> 3 37.87634 31.38593 44.36526     0   0    1 10.6642000 0.8522358 0.5184911
#> 4 57.29915 67.81063 39.12621     0   1    3  0.1972188 0.2909152 0.2841699
#> 5 57.90277 49.28486 20.16397     0   2    3  3.9988879 0.1521336 0.5906541
#> 6 52.92176 30.52901 39.68084     0   0    2  2.6398123 0.8283023 0.1771189
#>       gamma    chi_sq      t_dist    f_dist
#> 1 2.1331262 0.4091359  0.38334469 0.5132548
#> 2 8.4017675 1.4830168  0.06645704 1.9610566
#> 3 0.3149984 1.0915302  0.58173285 1.1829001
#> 4 0.6973711 0.6047910 -0.46067958 0.5318871
#> 5 6.8879225 1.2900952  0.09708759 1.5101296
#> 6 3.9811520 8.6402532  2.83688173 1.4700840

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm   norm_2   norm_3 binom neg pois        exp      unif      beta
#> 1 57.08602 47.86841 18.47513     0   0    2 10.6980421 0.4820095 0.2277996
#> 2 24.97982 74.53643 57.30413     1   0    5  0.9868764 0.5989633 0.1413982
#> 3 26.94944 62.89636 44.62021     1   1    0  3.1632984 0.4550301 0.2961892
#> 4 77.91185 66.34961 34.01029     1   0    5 44.2694052 0.5419444 0.4335306
#> 5 48.24955 66.94171 20.74053     0   0    1  4.2539453 0.2907816 0.3487935
#> 6 27.54323 47.04133 47.85437     0   5    2  8.9892236 0.2168603 0.4364966
#>        gamma    chi_sq     t_dist    f_dist
#> 1  4.8695576 10.450353 -0.7899664 0.4713585
#> 2  0.9266157  4.394429 -0.1796965 0.6520039
#> 3 13.2726575  0.943484  1.5466515 1.7783990
#> 4  5.4080187  6.585922  0.2512173 0.2533718
#> 5  1.9115269  2.599784 -0.2838543 1.3787694
#> 6  2.6004048  2.240470  0.1171521 0.7388637

Generate a large-sized dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 binom neg pois         exp        unif      beta
#> 1 49.90275 56.62359 49.235616     0   0    0  9.16426745 0.391779083 0.2506626
#> 2 46.20065 67.61841 72.543713     0   1    2  0.03060789 0.151821510 0.3834547
#> 3 77.88145 70.81872 43.319279     1   0    6 17.28685921 0.070600295 0.1159225
#> 4 55.48893 51.10649  8.591812     0   2    3  4.99532666 0.001219014 0.2920635
#> 5 50.62134 72.90647 47.956821     1   1    2 13.83761106 0.268229936 0.4366961
#> 6 35.76584 61.40471 23.218152     0   5    6 14.13727974 0.729345478 0.2617002
#>      gamma    chi_sq     t_dist    f_dist
#> 1 2.055949 12.880766  1.0066483 0.3354664
#> 2 6.361658 20.701630  0.9045218 0.8281796
#> 3 1.703692 12.032983 -1.9546264 0.9750920
#> 4 2.109311 11.745271  0.2308021 1.0365954
#> 5 1.968892  9.963203 -0.7003722 0.9779980
#> 6 6.069446 12.815971 -1.1093519 1.1923196

Adding Variation or Ensuring Reproducibility with set.seed()

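Call set.seed() immediately before samplezoo() to make the generated data reproducible: the same seed always yields the same dataset. To introduce controlled variation, change the seed; each seed value (e.g., 123 versus 456 below) produces a different dataset that is itself reproducible.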

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 binom neg pois        exp       unif      beta
#> 1 41.59287 83.70725 23.274065     0   1    4  6.4048718 0.85600403 0.1653553
#> 2 46.54734 58.33188 35.588540     1   3    1 17.3009535 0.60848590 0.3882179
#> 3 73.38062 69.26961 -2.070295     0   0    3  5.3458905 0.04511189 0.2785217
#> 4 51.05763 54.31848  6.643849     1   2    8 13.8085160 0.69891376 0.3521530
#> 5 51.93932 62.25090 18.040743     0   2    6  0.7016051 0.92416072 0.3165094
#> 6 75.72597 71.31986  6.687576     0   0    1 12.5201690 0.12233989 0.6563864
#>       gamma    chi_sq     t_dist    f_dist
#> 1 2.9538585  7.043010 -0.2133706 1.9694189
#> 2 5.0453279 15.098221 -0.9622741 0.4872297
#> 3 4.4267940  5.747358  1.1208978 0.5503424
#> 4 0.5134084  9.698822  0.1267564 0.4970970
#> 5 2.2646161  7.493394  0.6761810 1.5805821
#> 6 2.5383052 10.504139 -0.1209223 1.3571306
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2     norm_3 binom neg pois        exp      unif      beta
#> 1 29.84718 68.13494  7.9885694     0   0    6  0.3781432 0.3005967 0.4006597
#> 2 59.32663 52.32066 21.2526086     0   0    8 17.2863235 0.6932615 0.1509395
#> 3 62.01312 62.47569 38.4789563     1   1    2 31.1729393 0.6073009 0.4831276
#> 4 29.16661 53.51086 -0.8656269     0   0    0 17.0081881 0.5068313 0.2763463
#> 5 39.28465 47.19406 47.7819258     0   0    0  2.2526162 0.1013541 0.1490147
#> 6 45.13908 63.33566 53.3620528     1   6    4  1.4136846 0.9065914 0.3080311
#>       gamma    chi_sq     t_dist    f_dist
#> 1 2.7496531 11.072262 -0.4046877 1.4013145
#> 2 2.4903396  9.648422 -0.7709046 0.3333817
#> 3 1.4651266  6.867532 -0.4342910 0.9866586
#> 4 0.8683201 10.147377 -0.4917028 0.4282699
#> 5 2.6443795 12.757989  0.6695139 0.3894010
#> 6 1.9639101 12.467940 -0.3040910 1.0731372