Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

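Each dataset contains one column per distribution, named as in the output below: norm, norm2, and norm3 (normal), binom (binomial), neg (negative binomial), pois (Poisson), exp (exponential), unif (uniform), beta, gamma, chisq (chi-squared), and t_dist (Student's t).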

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm    norm2    norm3 binom neg pois       exp       unif      beta
#> 1 41.71856 67.81808 51.60422     0   0    4  8.238765 0.01567249 0.2862984
#> 2 31.96341 89.64689 50.78892     0   1    3 10.470693 0.71756647 0.4788767
#> 3 48.71022 56.99243 39.44686     0   0    2  9.908121 0.10850431 0.4231335
#> 4 38.95935 60.28838 89.35484     0   1    4 29.373273 0.10859841 0.1797686
#> 5 66.53657 70.18446 49.79334     0   0    3 14.571805 0.07231850 0.4307580
#> 6 29.90567 61.94870 62.33800     0   0    3 14.023376 0.96884193 0.2894295
#>       gamma     chisq     t_dist
#> 1 2.0357656 3.8404966  0.1096145
#> 2 1.4998904 0.7520087 -1.1145953
#> 3 0.6304285 1.2776924 -0.9743322
#> 4 0.8154457 2.0482300  1.6333500
#> 5 2.8349630 0.2202916 -0.9734399
#> 6 5.2960556 0.8458899 -1.0400458

Generate a medium dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm    norm2     norm3 binom neg pois        exp       unif      beta
#> 1 67.22159 61.14397 49.961968     0   2    5  1.9228950 0.89606497 0.2071879
#> 2 74.68645 48.87572 25.035572     0   0    0 17.2112242 0.16478069 0.2869354
#> 3 68.00435 55.81918  7.540233     0   0    3 10.6444322 0.62566623 0.5055417
#> 4 73.26883 78.62319 52.538824     0   3    3 28.5985018 0.56992118 0.1980817
#> 5 57.08670 58.15019 38.365797     0   3    2 12.7014979 0.19209306 0.2835067
#> 6 60.95354 53.66263 70.510219     0   2    4  0.4664658 0.05130417 0.1220230
#>      gamma     chisq     t_dist
#> 1 3.347114 0.8901615  0.4740891
#> 2 3.067555 5.4367785  2.4862193
#> 3 4.274730 2.6444388 -1.0474002
#> 4 1.849901 9.1521045  1.8635388
#> 5 5.429515 5.9480054  0.1960331
#> 6 5.737468 3.2040122 -0.1397868

Generate a large dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2     norm3 binom neg pois       exp      unif       beta
#> 1 68.01984 72.82804 35.643754     1   1    3 23.370730 0.8708623 0.40704110
#> 2 83.65588 66.43495  9.540925     0   1    3 35.770362 0.5868214 0.54959647
#> 3 59.74394 82.58383 34.650896     1   8    4  6.308334 0.1268779 0.17221645
#> 4 48.96896 61.84879 43.532524     0   0    5 18.686994 0.7612284 0.15184928
#> 5 72.53461 45.67266 35.700349     0   4    4  9.002839 0.8231389 0.09291141
#> 6 73.54867 79.28889 60.934396     0   0    2 21.668908 0.1522375 0.37017411
#>       gamma     chisq     t_dist
#> 1  1.562766  5.798502  0.8114000
#> 2 24.752146  6.996460  0.3805892
#> 3  7.371747  8.172433  0.6062020
#> 4  3.762310 12.004274  1.9925462
#> 5  3.106423 16.208775 -0.4742934
#> 6  1.528204 10.242665 -0.4805331

Adding Variation or Ensuring Reproducibility with set.seed()

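Call set.seed() immediately before samplezoo() to make the generated data reproducible: the same seed always yields the same dataset, while a different seed yields a different, but equally repeatable, dataset. The two examples below generate a large dataset with seeds 123 and 456.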

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2     norm3 binom neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065     0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540     0   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295     0   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849     0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743     0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576     0   1    4  6.363993 0.1442317 0.35908460
#>       gamma     chisq     t_dist
#> 1 6.9893762 10.286282 -0.3814568
#> 2 5.4087626  6.519658 -2.3409216
#> 3 1.2587867  8.011417 -0.4744159
#> 4 0.9871787 14.780626  0.4292511
#> 5 2.4021943  6.799788 -0.6692669
#> 6 4.2109032 17.858701 -0.3370763
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2      norm3 binom neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694     0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086     0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563     0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269     0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258     1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528     1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma     chisq     t_dist
#> 1 6.7914120  4.464348 -1.0150596
#> 2 3.0132520  8.062120  0.3262369
#> 3 4.7360954 10.969593  1.5141157
#> 4 5.1235878  6.249247  0.6432708
#> 5 6.6851637  4.358815  0.2025742
#> 6 0.3903841 20.019575  1.6257109