This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.
Each dataset provides:

- Variables (columns) drawn from common probability distributions such as Normal, Binomial, Poisson, and others.
- An adjustable sample size (small, medium, or large) to suit your needs.
data_small <- samplezoo("small")
head(data_small)
#> norm norm_2 norm_3 binom neg pois exp unif beta
#> 1 52.69034 49.96545 53.35655 1 0 3 2.0434317 0.1129147 0.1540612
#> 2 81.28921 49.37707 45.78710 1 0 2 20.4089309 0.1427738 0.4809002
#> 3 37.87634 31.38593 44.36526 0 0 1 10.6642000 0.8522358 0.5184911
#> 4 57.29915 67.81063 39.12621 0 1 3 0.1972188 0.2909152 0.2841699
#> 5 57.90277 49.28486 20.16397 0 2 3 3.9988879 0.1521336 0.5906541
#> 6 52.92176 30.52901 39.68084 0 0 2 2.6398123 0.8283023 0.1771189
#> gamma chi_sq t_dist f_dist
#> 1 2.1331262 0.4091359 0.38334469 0.5132548
#> 2 8.4017675 1.4830168 0.06645704 1.9610566
#> 3 0.3149984 1.0915302 0.58173285 1.1829001
#> 4 0.6973711 0.6047910 -0.46067958 0.5318871
#> 5 6.8879225 1.2900952 0.09708759 1.5101296
#> 6 3.9811520 8.6402532 2.83688173 1.4700840
data_medium <- samplezoo("medium")
head(data_medium)
#> norm norm_2 norm_3 binom neg pois exp unif beta
#> 1 57.08602 47.86841 18.47513 0 0 2 10.6980421 0.4820095 0.2277996
#> 2 24.97982 74.53643 57.30413 1 0 5 0.9868764 0.5989633 0.1413982
#> 3 26.94944 62.89636 44.62021 1 1 0 3.1632984 0.4550301 0.2961892
#> 4 77.91185 66.34961 34.01029 1 0 5 44.2694052 0.5419444 0.4335306
#> 5 48.24955 66.94171 20.74053 0 0 1 4.2539453 0.2907816 0.3487935
#> 6 27.54323 47.04133 47.85437 0 5 2 8.9892236 0.2168603 0.4364966
#> gamma chi_sq t_dist f_dist
#> 1 4.8695576 10.450353 -0.7899664 0.4713585
#> 2 0.9266157 4.394429 -0.1796965 0.6520039
#> 3 13.2726575 0.943484 1.5466515 1.7783990
#> 4 5.4080187 6.585922 0.2512173 0.2533718
#> 5 1.9115269 2.599784 -0.2838543 1.3787694
#> 6 2.6004048 2.240470 0.1171521 0.7388637
data_large <- samplezoo("large")
head(data_large)
#> norm norm_2 norm_3 binom neg pois exp unif beta
#> 1 49.90275 56.62359 49.235616 0 0 0 9.16426745 0.391779083 0.2506626
#> 2 46.20065 67.61841 72.543713 0 1 2 0.03060789 0.151821510 0.3834547
#> 3 77.88145 70.81872 43.319279 1 0 6 17.28685921 0.070600295 0.1159225
#> 4 55.48893 51.10649 8.591812 0 2 3 4.99532666 0.001219014 0.2920635
#> 5 50.62134 72.90647 47.956821 1 1 2 13.83761106 0.268229936 0.4366961
#> 6 35.76584 61.40471 23.218152 0 5 6 14.13727974 0.729345478 0.2617002
#> gamma chi_sq t_dist f_dist
#> 1 2.055949 12.880766 1.0066483 0.3354664
#> 2 6.361658 20.701630 0.9045218 0.8281796
#> 3 1.703692 12.032983 -1.9546264 0.9750920
#> 4 2.109311 11.745271 0.2308021 1.0365954
#> 5 1.968892 9.963203 -0.7003722 0.9779980
#> 6 6.069446 12.815971 -1.1093519 1.1923196
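To make your results reproducible, call set.seed() before generating random data. Using the same seed always produces the same dataset, while changing the seed produces a different, but equally reproducible, dataset, as the two examples below show.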
set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#> norm norm_2 norm_3 binom neg pois exp unif beta
#> 1 41.59287 83.70725 23.274065 0 1 4 6.4048718 0.85600403 0.1653553
#> 2 46.54734 58.33188 35.588540 1 3 1 17.3009535 0.60848590 0.3882179
#> 3 73.38062 69.26961 -2.070295 0 0 3 5.3458905 0.04511189 0.2785217
#> 4 51.05763 54.31848 6.643849 1 2 8 13.8085160 0.69891376 0.3521530
#> 5 51.93932 62.25090 18.040743 0 2 6 0.7016051 0.92416072 0.3165094
#> 6 75.72597 71.31986 6.687576 0 0 1 12.5201690 0.12233989 0.6563864
#> gamma chi_sq t_dist f_dist
#> 1 2.9538585 7.043010 -0.2133706 1.9694189
#> 2 5.0453279 15.098221 -0.9622741 0.4872297
#> 3 4.4267940 5.747358 1.1208978 0.5503424
#> 4 0.5134084 9.698822 0.1267564 0.4970970
#> 5 2.2646161 7.493394 0.6761810 1.5805821
#> 6 2.5383052 10.504139 -0.1209223 1.3571306
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#> norm norm_2 norm_3 binom neg pois exp unif beta
#> 1 29.84718 68.13494 7.9885694 0 0 6 0.3781432 0.3005967 0.4006597
#> 2 59.32663 52.32066 21.2526086 0 0 8 17.2863235 0.6932615 0.1509395
#> 3 62.01312 62.47569 38.4789563 1 1 2 31.1729393 0.6073009 0.4831276
#> 4 29.16661 53.51086 -0.8656269 0 0 0 17.0081881 0.5068313 0.2763463
#> 5 39.28465 47.19406 47.7819258 0 0 0 2.2526162 0.1013541 0.1490147
#> 6 45.13908 63.33566 53.3620528 1 6 4 1.4136846 0.9065914 0.3080311
#> gamma chi_sq t_dist f_dist
#> 1 2.7496531 11.072262 -0.4046877 1.4013145
#> 2 2.4903396 9.648422 -0.7709046 0.3333817
#> 3 1.4651266 6.867532 -0.4342910 0.9866586
#> 4 0.8683201 10.147377 -0.4917028 0.4282699
#> 5 2.6443795 12.757989 0.6695139 0.3894010
#> 6 1.9639101 12.467940 -0.3040910 1.0731372