Introduction to ggESDA

Bo-Syue Jiang

2021-08-31

Symbolic data analysis (SDA) is an extension of standard data analysis in which symbolic data tables are used as input and symbolic objects are produced as output. The data units are called symbolic because they are more complex than standard ones: they not only contain values or categories, but also include internal variation and structure.[1][2]

ggESDA is an extension of ggplot2 for visualizing symbolic data based on exploratory data analysis (EDA). The package contains many useful functions for exploratory plots. Furthermore, users can also transform classical data into symbolic data with the functions in this package.

Package installation

devtools::install_github("kiangkiangkiang/ggESDA")
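The installation above relies on devtools; if it is not already installed, get it from CRAN first:

install.packages("devtools")  # needed for install_github()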

Creating

The example data, called “facedata”, comes from the RSDA package. It is in interval form, with minimal and maximal values. However, most symbolic data do not exist at the beginning; they are usually aggregated from classical data by a clustering method. Thus, we will use the classic2sym() function to transform classical data into symbolic data as the second example data set.

library(ggESDA)
#aggregated by the variable Species in iris
iris_interval<-classic2sym(iris,groupby = "Species")$intervalData
iris_interval
#>             Sepal.Length   Sepal.Width  Petal.Length   Petal.Width
#> setosa     [4.30 : 5.80] [2.30 : 4.40] [1.00 : 1.90] [0.10 : 0.60]
#> versicolor [4.90 : 7.00] [2.00 : 3.40] [3.00 : 5.10] [1.00 : 1.80]
#> virginica  [4.90 : 7.90] [2.20 : 3.80] [4.50 : 6.90] [1.40 : 2.50]
class(iris_interval)
#> [1] "symbolic_tbl" "data.frame"
dim(iris_interval)
#> [1] 3 4
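As a sketch of the clustering route mentioned above, classic2sym() can also aggregate the raw iris measurements with k-means; the groupby = "kmeans" and k arguments are assumed here to behave as in the package documentation (see ?classic2sym).

# aggregate the numeric columns of iris into 3 concepts via k-means
# (groupby = "kmeans" and k are assumed arguments; see ?classic2sym)
iris_kmeans <- classic2sym(iris[, 1:4], groupby = "kmeans", k = 3)$intervalData
iris_kmeans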

myFacedata<-RSDA::facedata
head(myFacedata,5)
#> # A tibble: 5 x 6
#>                  AD              BC                AH                DH
#>          <symblc_n>      <symblc_n>        <symblc_n>        <symblc_n>
#> 1 [155.00 : 157.00] [58.00 : 61.01] [100.45 : 103.28] [105.00 : 107.30]
#> 2 [154.00 : 160.01] [57.00 : 64.00] [101.98 : 105.55] [104.35 : 107.30]
#> 3 [154.01 : 161.00] [57.00 : 63.00]  [99.36 : 105.65] [101.04 : 109.04]
#> 4 [168.86 : 172.84] [58.55 : 63.39] [102.83 : 106.53] [122.38 : 124.52]
#> 5 [169.85 : 175.03] [60.21 : 64.38] [102.94 : 108.71] [120.24 : 124.52]
#> # ... with 2 more variables: EH <symblc_n>, GH <symblc_n>
class(myFacedata)
#> [1] "symbolic_tbl" "tbl_df"       "tbl"          "data.frame"
dim(myFacedata)
#> [1] 27  6

Visualization

With the symbolic data generated, you can start to visualize it. Here we work with real symbolic data sets, the face recognition data and the Environment data, using the following functions:

ggInterval_index() for visualizing the interval of each observation

ggInterval_index(facedata, aes(x = AD))

You can also change fill = and col = to make the plot more readable, and mapping your variable to the x or y axis will rotate the index lines in the figure.

m <- mean(facedata$AD)
Concepts <- as.factor(rep(c("FRA", "HUS", "INC", "ISA", "JPL", "KHA",
                            "LOT", "PHI", "ROM"), each = 3))
ggInterval_index(facedata, aes(x = AD, fill = Concepts))+
  theme_bw() +
  scale_fill_brewer(palette = "Set2")+
  geom_segment(x = m, xend = m, y = 0, yend = 27,
               lty = 2, col = "red", lwd = 1) +
  geom_text(aes(x = m, y = 28), label = "Mean")+
  scale_y_continuous(breaks = 1:27,
                     labels = rownames(facedata))
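For instance, mapping AD to the y axis instead of x should rotate the index lines, as described above (a minimal sketch):

# the same index plot, with the variable mapped to the y axis
ggInterval_index(facedata, aes(y = AD))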

ggInterval_minmax() for visualizing the interval of each observation, sorted by its minimal value


ggInterval_minmax(facedata, aes(x = AD, size = 3)) +
  scale_color_manual(values = c("darkblue", "darkred")) +
  coord_fixed(ratio = 1) +
  theme_bw()

ggInterval_boxplot() for visualizing the distribution of the interval data

The width of the rectangles in the boxplot is not meaningful. Instead, it is used to distinguish the quantiles, as is the fill color.

ggInterval_boxplot(facedata, plotAll = T) +
  theme_bw()

You can also change the fill color and any other aesthetic as you like.

ggInterval_boxplot(data = myFacedata, aes(x = AD, col = "black", lty = 2, lwd = 1.2)) +
  scale_fill_manual(values = c("red", "yellow",
                               "green", "blue", "grey"),
                    labels = c("0%", "25%", "50%", "75%", "100%"),
                    name = "quantile")

ggInterval_hist() for visualizing the distribution of the interval data

The histogram of interval data is built by calculating the frequency of each interval in the bins. In the ggInterval_hist() function, there are two main parameters the user can adjust. One is bins, just as in geom_histogram(), with a default of 10. The other is method, which can be either equal-bin or unequal-bin, with equal-bin as the default.

Note: a change of bins under the unequal-bin method will be ignored because it conflicts with the algorithm.

equal_bin <- ggInterval_hist(facedata, plotAll = T) +
              theme_bw()
unequal_bin <- ggInterval_hist(facedata, plotAll = T,
                               method = "unequal-bin") +
              theme_bw()
ggpubr::ggarrange(equal_bin, unequal_bin, ncol = 2)
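For a single variable you can also raise bins above its default of 10 under the equal-bin method (a sketch, assuming ggInterval_hist() accepts the same aes() mapping as the other ggInterval functions; recall that bins is ignored for the unequal-bin method):

# equal-bin histogram of AD with more bins than the default of 10
ggInterval_hist(facedata, aes(x = AD), bins = 20)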

ggInterval_centerRange for visualizing the relation between the center and range of the interval of each observation.

ggInterval_centerRange(iris_interval,aes(x = Sepal.Width)) +
  geom_text(label = rownames(iris_interval), vjust = -0.8) +
  scale_x_continuous(limits = c(2.6, 3.4)) +
  scale_y_continuous(limits = c(1.3, 2.2))

ggInterval_centerRange(myFacedata[11:20, ],aes(x = GH))+
  geom_text(label = rownames(myFacedata)[11:20], vjust = -0.8, size =  3)

ggInterval_scatter for visualizing the relation between two variables

Because each value is an interval, a rectangle is a natural way to represent the intervals of two variables.

myCol <- rep(RColorBrewer::brewer.pal(9, "Set1"), each = 3)
ggInterval_scatter(data = facedata, aes(x = AD, y = BC)) +
  scale_fill_manual(values = myCol, name = "CONCEPTS",
                    label = rownames(facedata)) +
  theme_bw()

ggInterval_scaMatrix for visualizing the relations among all variables with scatter plots at once

ggInterval_scaMatrix(facedata)

ggInterval_2Dhist for visualizing the joint distribution of two variables by the frequency of their interval bins

ggInterval_2Dhist(iris_interval, aes(x = Sepal.Length, y = Petal.Length))

It can be adjusted with ggplot2 functions too. See ?ggInterval_2Dhist for more details.

ggInterval_2Dhist(facedata, aes(x = BC, y = AH, col = "gray50")) +
  scale_fill_gradient(
    low = "gray85",
    high = "red"
  ) +
  theme_bw()

ggInterval_2DhistMatrix for visualizing the relations among all variables with 2D histograms at once

Note: it is not recommended to use too many variables, because the time needed to calculate the full matrix grows quickly.

ggInterval_2DhistMatrix(facedata,
                        xBins = 10,
                        yBins = 10,
                        removeZero = T,
                        addFreq = F)
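Given the note above, one workaround is to pass only a subset of columns (a sketch, assuming column subsetting of the symbolic table preserves its class):

# restrict the matrix to three variables to keep computation manageable
ggInterval_2DhistMatrix(facedata[, c("AD", "BC", "AH")],
                        xBins = 10, yBins = 10)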

ggInterval_indexImage for visualizing the interval of each observation as an image

ggInterval_indexImage(facedata, aes(x = AD)) +
  coord_flip()

p1 <- ggInterval_indexImage(facedata, plotAll = T, column_condition = T,
                      full_strip = T)

p2 <- ggInterval_indexImage(facedata, plotAll = T, column_condition = F,
                      full_strip = T)

ggpubr::ggarrange(p1, p2, ncol = 2)

ggInterval_3Dscatter for visualizing the relation of three variables

ggInterval_3Dscatter(iris_interval, aes(Sepal.Length, Petal.Length, Petal.Width))

If the variance is too large (or small), or the scales of two variables differ greatly, the plot may be distorted or unreadable; this typically happens when variables are measured in different units.

So, standardizing with scale = TRUE is necessary.

ggInterval_3Dscatter(myFacedata[1:8, ], aes(AD, BC, AH), scale = TRUE)

ggInterval_radar for visualizing the intervals of multiple variables


p1 <- ggInterval_radar(Environment, 
                 plotPartial = 2,
                 showLegend = F,
                 base_circle = T,
                 base_lty = 2,
                 addText = F) +
  labs(title = "") +
  theme_bw() +
  scale_fill_manual(values = c("gray50")) +
  scale_color_manual(values = c("gray50")) 


p2 <- ggInterval_radar(Environment, 
                       plotPartial = 7,
                       showLegend = F,
                       base_circle = F,
                       base_lty = 1,
                       addText = T) +
  labs(title = "") +
  theme_bw() +
  scale_fill_manual(values = c("gray50")) +
  scale_color_manual(values = c("gray50"))
ggpubr::ggarrange(p1, p2, ncol = 2)

It can also plot a subset of observations via plotPartial =, whose value is the vector of row indices of the observations you want to see.

p1 <- ggInterval_radar(Environment,
                 plotPartial = c(1, 4),
                 showLegend = F,
                 addText = F) +
  scale_fill_manual(values = c("darkblue", "darkred")) +
  scale_color_manual(values = c("darkblue", "darkred"))

p2 <- ggInterval_radar(Environment,
                       plotPartial = c(1, 4),
                       showLegend = F,
                       addText = F,
                       base_circle = F,
                       base_lty = 1,
                       type = "rect") +
  scale_fill_manual(values = c("darkblue", "darkred")) +
  scale_color_manual(values = c("darkblue", "darkred"))
ggpubr::ggarrange(p1, p2, ncol = 2)

A quantile radar plot:


dataSetList <- list(AbaloneIdt = AbaloneIdt, 
                    BLOOD = BLOOD,
                    Cardiological = Cardiological,
                    facedata = facedata,
                    oils = oils,
                    mushroom = mushroom,
                    Environment = Environment)
myFill <- c("white", "gray10", "gray20",
            "gray30", "gray40", "gray50",
            "gray60", "gray70", "white",
            "white", "white")
myCol <- myFill; myCol[1] <- "black"
pList <- NULL
u <- 1
for(i in dataSetList){
  p <- ggInterval_radar(i,
                        base_circle = F,
                        base_lty = 1,
                        type = "quantile",
                        quantileNum = 10,
                        showLegend = F,
                        Drift = 0)+
    labs(title = names(dataSetList)[u]) +
    scale_fill_manual(values = rev(myFill)) +
    scale_color_manual(values = rev(myCol)) +
    ggthemes::theme_hc()
  pList[[u]] <- p
  u <- u + 1
}

gridExtra::marrangeGrob(pList, nrow = 2, ncol = 4,
                        top = "")

ggInterval_PCA for dimension reduction in interval data



CONCEPT <- rep(c("FRA", "HUS", "INC", "ISA", "JPL", "KHA",
           "LOT", "PHI", "ROM"), each = 3)
p <- ggInterval_PCA(facedata, poly = T,
                    concepts_group = CONCEPT)
p$ggplotPCA <- p$ggplotPCA + theme(legend.position = "top") + 
  theme_bw() 

p2 <- ggInterval_PCA(facedata, poly = F,
                    concepts_group = CONCEPT)
p2$ggplotPCA <- p2$ggplotPCA + theme(legend.position = "top") +
  theme_bw()

ggpubr::ggarrange(p$ggplotPCA, p2$ggplotPCA, ncol = 2)

myPCA <- p2
myPCA$loadings
#> 
#> Loadings:
#>      Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
#> [1,]  0.403  0.439  0.214  0.257  0.723  0.104
#> [2,]  0.306  0.497 -0.175 -0.779 -0.146       
#> [3,]  0.523 -0.147 -0.463  0.273 -0.248  0.595
#> [4,]  0.557         0.206  0.283 -0.423 -0.620
#> [5,] -0.297  0.466 -0.700  0.329        -0.311
#> [6,] -0.268  0.563  0.421  0.253 -0.465  0.392
#> 
#>                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
#> SS loadings     1.000  1.000  1.000  1.000  1.000  1.000
#> Proportion Var  0.167  0.167  0.167  0.167  0.167  0.167
#> Cumulative Var  0.167  0.333  0.500  0.667  0.833  1.000
cumsum(myPCA$sdev/sum(myPCA$sdev))
#>    Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6 
#> 0.2938526 0.5401279 0.6873056 0.8139909 0.9203008 1.0000000
head(myPCA$scores_interval[,1:3])
#>                PC_1           PC_2            PC_3
#> FRA1  [1.61 : 2.66]  [0.27 : 1.57]  [-1.00 : 0.29]
#> FRA2  [1.03 : 2.49] [-0.11 : 1.61]  [-1.01 : 0.25]
#> FRA3  [0.81 : 2.99] [-0.40 : 1.87]  [-1.20 : 0.88]
#> HUS1 [-1.10 : 0.24]  [0.39 : 2.05] [-2.13 : -0.64]
#> HUS2 [-1.41 : 0.40]  [0.56 : 2.65] [-2.32 : -0.29]
#> HUS3 [-1.42 : 0.24]  [0.43 : 2.52] [-2.17 : -0.27]

References

  1. Diday, Edwin; Esposito, Floriana (December 2003). “An introduction to symbolic data analysis and the SODAS software”.

  2. Billard, Lynne; Diday, Edwin (14 May 2012). Symbolic Data Analysis: Conceptual Statistics and Data Mining.