RScelestial Vignette

Mohammad-Hadi Foroughmand-Araabi

2023-11-30

library(RScelestial)
# We load igraph for drawing trees. If you do not want to draw,
# there is no need to import igraph.
library(igraph)
#> 
#> Attaching package: 'igraph'
#> The following objects are masked from 'package:stats':
#> 
#>     decompose, spectrum
#> The following object is masked from 'package:base':
#> 
#>     union

Installing RScelestial

The RScelestial package could be installed easily as follows

install.packages("RScelestial")

Simulation

Here we show a simulation. We build a data set with following command.

# Following command generates ten samples with 20 loci. 
# Rate of mutations on each edge of the evolutionary tree is 1.5. 
D = synthesis(10, 20, 5, seed = 7)
D
#> $seqeunce
#>     C1 C2 C3 C4 C5 C6 C7 C8 C9 C10
#> L1   0  0  0  3  0  1  0  3  0   0
#> L2   3  3  3  3  3  3  3  3  0   3
#> L3   0  3  3  0  3  1  0  0  0   3
#> L4   1  0  3  3  1  3  0  1  0   3
#> L5   0  3  3  1  3  3  0  0  0   0
#> L6   3  3  0  3  0  3  3  3  3   3
#> L7   0  0  0  0  3  0  3  0  3   0
#> L8   3  1  0  1  0  3  1  3  3   1
#> L9   0  0  3  3  3  3  3  0  0   3
#> L10  3  3  0  1  3  0  3  3  1   0
#> L11  0  3  0  0  0  0  3  1  3   0
#> L12  3  3  3  0  0  0  3  0  3   3
#> L13  3  1  1  3  0  3  0  0  0   3
#> L14  3  0  0  0  1  3  0  3  3   0
#> L15  0  3  0  3  0  0  3  3  1   0
#> L16  3  0  3  0  3  0  3  0  3   0
#> L17  3  3  0  3  3  3  0  3  1   3
#> L18  0  0  3  3  3  0  1  3  0   3
#> L19  3  3  0  1  3  0  0  3  0   3
#> L20  3  1  3  0  1  3  3  3  3   0
#> 
#> $true.sequence
#>     C1 C2 C3 C4 C5 C6 C7 C8 C9 C10
#> L1   0  0  0  0  0  0  0  0  0   0
#> L2   0  0  0  0  0  0  0  0  0   0
#> L3   0  0  0  0  0  0  0  0  0   0
#> L4   0  0  0  0  0  0  0  0  0   0
#> L5   0  0  0  0  0  0  0  0  0   0
#> L6   0  0  0  0  0  0  0  0  0   0
#> L7   0  0  0  0  0  0  0  0  0   0
#> L8   0  0  0  1  0  0  1  0  1   1
#> L9   0  0  0  0  0  0  0  0  0   0
#> L10  0  0  0  0  0  0  0  0  0   0
#> L11  0  0  0  0  0  0  0  0  0   0
#> L12  0  0  0  0  0  0  0  0  0   0
#> L13  0  0  0  0  0  0  0  0  0   0
#> L14  0  0  0  0  0  0  0  0  0   0
#> L15  0  0  0  0  0  0  0  0  0   0
#> L16  0  0  0  0  0  0  0  0  0   0
#> L17  0  0  0  0  0  0  0  0  0   0
#> L18  0  0  0  0  0  0  0  0  0   0
#> L19  0  0  0  0  0  0  0  0  0   0
#> L20  0  0  0  0  0  0  0  0  0   0
#> 
#> $true.clone
#> $true.clone$N1
#> [1] "C2" "C6"
#> 
#> $true.clone$N2
#> [1] "C5"
#> 
#> $true.clone$N3
#> [1] "C1" "C8"
#> 
#> $true.clone$N4
#> [1] "C3"
#> 
#> $true.clone$N6
#> [1] "C4"  "C7"  "C9"  "C10"
#> 
#> 
#> $true.tree
#>   src dest len
#> 1  N2   N1   1
#> 2  N3   N2   1
#> 3  N4   N2   1
#> 4  N6   N3   1

Inferring the phylogenetic tree

seq = as.ten.state.matrix(D$seqeunce)
SP = scelestial(seq, return.graph = TRUE)
SP
#> $input
#>      V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#> C1  A/A ./. A/A C/C A/A ./. A/A ./. A/A ./. A/A ./. ./. ./. A/A ./. ./. A/A ./.
#> C10 A/A ./. ./. ./. A/A ./. A/A C/C ./. A/A A/A ./. ./. A/A A/A A/A ./. ./. ./.
#> C2  A/A ./. ./. A/A ./. ./. A/A C/C A/A ./. ./. ./. C/C A/A ./. A/A ./. A/A ./.
#> C3  A/A ./. ./. ./. ./. A/A A/A A/A ./. A/A A/A ./. C/C A/A A/A ./. A/A ./. A/A
#> C4  ./. ./. A/A ./. C/C ./. A/A C/C ./. C/C A/A A/A ./. A/A ./. A/A ./. ./. C/C
#> C5  A/A ./. ./. C/C ./. A/A ./. A/A ./. ./. A/A A/A A/A C/C A/A ./. ./. ./. ./.
#> C6  C/C ./. C/C ./. ./. ./. A/A ./. ./. A/A A/A A/A ./. ./. A/A A/A ./. A/A A/A
#> C7  A/A ./. A/A A/A A/A ./. ./. C/C ./. ./. ./. ./. A/A A/A ./. ./. A/A C/C A/A
#> C8  ./. ./. A/A C/C A/A ./. A/A ./. A/A ./. C/C A/A A/A ./. ./. A/A ./. ./. ./.
#> C9  A/A A/A A/A A/A A/A ./. ./. ./. A/A C/C ./. ./. A/A ./. C/C ./. C/C A/A A/A
#>     V20
#> C1  ./.
#> C10 A/A
#> C2  C/C
#> C3  ./.
#> C4  A/A
#> C5  C/C
#> C6  ./.
#> C7  ./.
#> C8  ./.
#> C9  ./.
#> 
#> $sequence
#>      V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#> C1  A/A A/A A/A C/C A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A
#> C10 A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A
#> C2  A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A
#> C3  A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A
#> C4  A/A A/A A/A A/A C/C A/A A/A C/C A/A C/C A/A A/A A/A A/A A/A A/A A/A A/A C/C
#> C5  A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A A/A
#> C6  C/C A/A C/C A/A A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A
#> C7  A/A A/A A/A A/A A/A A/A A/A C/C A/A C/C A/A A/A A/A A/A C/C A/A A/A C/C A/A
#> C8  A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A A/A
#> C9  A/A A/A A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A C/C A/A C/C A/A A/A
#>     V20
#> C1  A/A
#> C10 A/A
#> C2  C/C
#> C3  A/A
#> C4  A/A
#> C5  C/C
#> C6  A/A
#> C7  A/A
#> C8  A/A
#> C9  A/A
#> 
#> $tree
#>   src dest     len
#> 1  C3  C10 6.00013
#> 2  C1   C8 6.00014
#> 3  C1  C10 6.00015
#> 4  C4  C10 6.25012
#> 5  C7   C9 6.50012
#> 6  C2  C10 6.50014
#> 7  C6  C10 6.50014
#> 8  C1   C5 6.75016
#> 9  C1   C9 7.00013
#> 
#> $graph
#> IGRAPH fd2a474 UNW- 10 9 -- 
#> + attr: name (v/c), weight (e/n)
#> + edges from fd2a474 (vertex names):
#> [1] C3 --C10 C1 --C8  C10--C1  C10--C4  C7 --C9  C10--C2  C10--C6  C1 --C5 
#> [9] C1 --C9

You can draw the graph with following command

tree.plot(SP, vertex.size = 30)

Also, we can make a rooted tree with cell “C8” as the root of the tree as follows:

SP = scelestial(seq, root.assign.method = "fix", root = "C8", return.graph = TRUE)
#> [1] "C8 -1"
#> [1] "C1 C8"
#> [1] "C10 C1"
#> [1] "C3 C10"
#> [1] "C4 C10"
#> [1] "C2 C10"
#> [1] "C6 C10"
#> [1] "C5 C1"
#> [1] "C9 C1"
#> [1] "C7 C9"
tree.plot(SP, vertex.size = 30)

Setting root.assign.method to “balance” lets the algorithm decide for a root that produces minimum height tree.

SP = scelestial(seq, root.assign.method = "balance", return.graph = TRUE)
#> [1] "C1 -1"
#> [1] "C8 C1"
#> [1] "C10 C1"
#> [1] "C3 C10"
#> [1] "C4 C10"
#> [1] "C2 C10"
#> [1] "C6 C10"
#> [1] "C5 C1"
#> [1] "C9 C1"
#> [1] "C7 C9"
#> [1] "C1 -1"
#> [1] "C8 C1"
#> [1] "C10 C1"
#> [1] "C3 C10"
#> [1] "C4 C10"
#> [1] "C2 C10"
#> [1] "C6 C10"
#> [1] "C5 C1"
#> [1] "C9 C1"
#> [1] "C7 C9"
tree.plot(SP, vertex.size = 30)

Evaluating results

Following command calculates the distance array between pairs of samples.

D.distance.matrix <- distance.matrix.true.tree(D)
D.distance.matrix
#>              C2          C6          C5          C1          C8          C3
#> C2  0.000000000 0.000000000 0.006849315 0.013698630 0.013698630 0.013698630
#> C6  0.000000000 0.000000000 0.006849315 0.013698630 0.013698630 0.013698630
#> C5  0.006849315 0.006849315 0.000000000 0.006849315 0.006849315 0.006849315
#> C1  0.013698630 0.013698630 0.006849315 0.000000000 0.000000000 0.013698630
#> C8  0.013698630 0.013698630 0.006849315 0.000000000 0.000000000 0.013698630
#> C3  0.013698630 0.013698630 0.006849315 0.013698630 0.013698630 0.000000000
#> C4  0.020547945 0.020547945 0.013698630 0.006849315 0.006849315 0.020547945
#> C7  0.020547945 0.020547945 0.013698630 0.006849315 0.006849315 0.020547945
#> C9  0.020547945 0.020547945 0.013698630 0.006849315 0.006849315 0.020547945
#> C10 0.020547945 0.020547945 0.013698630 0.006849315 0.006849315 0.020547945
#>              C4          C7          C9         C10
#> C2  0.020547945 0.020547945 0.020547945 0.020547945
#> C6  0.020547945 0.020547945 0.020547945 0.020547945
#> C5  0.013698630 0.013698630 0.013698630 0.013698630
#> C1  0.006849315 0.006849315 0.006849315 0.006849315
#> C8  0.006849315 0.006849315 0.006849315 0.006849315
#> C3  0.020547945 0.020547945 0.020547945 0.020547945
#> C4  0.000000000 0.000000000 0.000000000 0.000000000
#> C7  0.000000000 0.000000000 0.000000000 0.000000000
#> C9  0.000000000 0.000000000 0.000000000 0.000000000
#> C10 0.000000000 0.000000000 0.000000000 0.000000000
SP.distance.matrix <- distance.matrix.scelestial(SP)
SP.distance.matrix
#>              C1         C10          C2          C3          C4          C5
#> C1  0.000000000 0.004528317 0.009433976 0.009056619 0.009245286 0.005094350
#> C10 0.004528317 0.000000000 0.004905660 0.004528302 0.004716969 0.009622667
#> C2  0.009433976 0.004905660 0.000000000 0.009433961 0.009622629 0.014528326
#> C3  0.009056619 0.004528302 0.009433961 0.000000000 0.009245271 0.014150968
#> C4  0.009245286 0.004716969 0.009622629 0.009245271 0.000000000 0.014339636
#> C5  0.005094350 0.009622667 0.014528326 0.014150968 0.014339636 0.000000000
#> C6  0.009433976 0.004905660 0.009811319 0.009433961 0.009622629 0.014528326
#> C7  0.010188647 0.014716964 0.019622623 0.019245265 0.019433933 0.015282997
#> C8  0.004528309 0.009056626 0.013962286 0.013584928 0.013773595 0.009622659
#> C9  0.005283002 0.009811319 0.014716979 0.014339621 0.014528288 0.010377352
#>              C6          C7          C8          C9
#> C1  0.009433976 0.010188647 0.004528309 0.005283002
#> C10 0.004905660 0.014716964 0.009056626 0.009811319
#> C2  0.009811319 0.019622623 0.013962286 0.014716979
#> C3  0.009433961 0.019245265 0.013584928 0.014339621
#> C4  0.009622629 0.019433933 0.013773595 0.014528288
#> C5  0.014528326 0.015282997 0.009622659 0.010377352
#> C6  0.000000000 0.019622623 0.013962286 0.014716979
#> C7  0.019622623 0.000000000 0.014716956 0.004905644
#> C8  0.013962286 0.014716956 0.000000000 0.009811312
#> C9  0.014716979 0.004905644 0.009811312 0.000000000
## Difference between normalized distance matrices
vertices <- rownames(SP.distance.matrix)
sum(abs(D.distance.matrix[vertices,vertices] - SP.distance.matrix))
#> [1] 0.5453399

Running Scelestial on multiple sequence alignment

Given a multiple sequence alignment, Scelestial infers the phylogeny of them. Here we present a simple example. First we load libraries to load a multiple alignment.

library(stringr)
if (!require("seqinr")) install.packages("seqinr")
#> Loading required package: seqinr
library(seqinr)

In this example, we load a multiple alignment from seqinr package.

data(phylip, package = "seqinr")

Then we clean the data and build a zero-one matrix representing taxa and characters. Note that Scelestial accept matrices with taxa as its columns and characters as its rows.

# Removing non-informative columns and duplicate rows.
mcb <-  toupper(t(sapply(seq(phylip$seq), function(i) unlist(strsplit(phylip$seq[[i]], '')))))
ccb <- as.character(phylip$seq)
occb <- order(ccb)
cbColMask <- sapply(seq(ncol(mcb)), function(j) length(levels(as.factor(mcb[,j]))) == 1)
cbRowMask <- rep(TRUE, length(ccb))
for (i in seq(length(ccb))) {
    if (i == 1 || ccb[occb[i]] != ccb[occb[i-1]]) {
        cbRowMask[occb[i]] <- FALSE
    }
}
mcbRows <- apply(mcb[!cbRowMask, !cbColMask], MARGIN = 1, FUN = function(a) paste0(str_replace(a, "-", "X"), collapse = ""))

Executing Scelestial on the input matrix.

n.seq <- data.frame(nodes = phylip$nam[!cbRowMask], seq = mcbRows)
seq2 <- data.frame(t(as.ten.state.matrix.from.node.seq(n.seq)), stringsAsFactors = TRUE)
# Running Scelestial
SP = scelestial(seq2, return.graph = TRUE)
tree.plot(SP, vertex.size=20, vertex.label.dist=0, asp = 0, vertex.label.cex = 1)