## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(dplyr)
library(ggplot2)
library(fastRG)
library(irlba)
library(Matrix)
library(purrr)
library(scales)
library(tidyr)

set.seed(27)

model_distinct <- function(n, k = 5) {
  B <- matrix(0.05, nrow = k, ncol = k)
  diag(B) <- seq(0.8, 0.4, length.out = k)
  latent <- dcsbm(
    theta = rexp(n) + 1,
    B = B,
    expected_degree = 0.1 * n
  )
}

model_repeated <- function(n, k = 5) {
  latent <- dcsbm(
    theta = rep(1, n),
    B = diag(0.5, k),
    expected_degree = 0.1 * n
  )
}

## -----------------------------------------------------------------------------
# sample the latent parameters of the blockmodel
latent <- model_distinct(1000)

# compute the population singular value decomposition of the blockmodel
s_pop <- svds(latent)

# sample a network conditional on the latent factors
A <- sample_sparse(latent)

# singular value decomposition of the observed network
s_obs <- irlba(A, 5)

# difference between population and sample singular values
s_pop$d - s_obs$d

## -----------------------------------------------------------------------------
sin_theta_distance <- function(u, v) {
  s <- svd(crossprod(u, v))
  ncol(u) - sum(s$d^2)
}

find_procrustes_rotation <- function(X, Y) {
  s <- svd(crossprod(X, Y))
  tcrossprod(s$v, s$u) # NOTE U, V swap versus next one
}

procrustes_align <- function(X, Y) {
  s <- svd(crossprod(X, Y))
  rotation <- tcrossprod(s$v, s$u)
  Y %*% rotation
}

two_to_infinity_loss <- function(X, Y) {
  s <- svd(crossprod(X, Y))
  rotation <- tcrossprod(s$v, s$u)
  Yrot <- Y %*% rotation
  diff <- X - Yrot
  max(sqrt(Matrix::rowSums(diff^2)))
}

loss_helper <- function(s_pop, s_obs) {
  u_pop <- s_pop$u
  u_obs <- s_obs$u

  d_pop <- s_pop$d
  d_obs <- s_obs$d

  x_pop <- u_pop %*% sqrt(diag(d_pop))
  x_obs <- u_obs %*% sqrt(diag(d_obs))

  spectral_loss_relative <- abs(d_pop - d_obs) / d_pop
  
  column_sin_theta_loss <- map_dbl(
    1:ncol(u_pop),
    \(j) {
      sin_theta_distance(u_pop[, j, drop = FALSE], u_obs[, j, drop = FALSE])
    }
  )

  tibble(
    sin_theta_loss = sin_theta_distance(u_pop, u_obs),
    u_two_inf_loss = two_to_infinity_loss(u_pop, u_obs),
    x_two_inf_loss = two_to_infinity_loss(x_pop, x_obs),
    spectral_loss_relative1 = spectral_loss_relative[1],
    spectral_loss_relative2 = spectral_loss_relative[2],
    spectral_loss_relative3 = spectral_loss_relative[3],
    spectral_loss_relative4 = spectral_loss_relative[4],
    spectral_loss_relative5 = spectral_loss_relative[5],
    sin_theta_loss1 = column_sin_theta_loss[1],
    sin_theta_loss2 = column_sin_theta_loss[2],
    sin_theta_loss3 = column_sin_theta_loss[3],
    sin_theta_loss4 = column_sin_theta_loss[4],
    sin_theta_loss5 = column_sin_theta_loss[5]
  )
}

## -----------------------------------------------------------------------------

run_simulation <- function(model, num_reps = 30) {
  expand_grid(
    n = c(100, 180, 330, 600, 1100, 2000),
    reps = 1:num_reps
  ) |>
    mutate(
      pop = map(n, model),
      s_pop = map(pop, svds),
      A = map(pop, sample_sparse),
      s_obs = map(A, irlba, 5), # rank five svd
      loss = map2(s_pop, s_obs, loss_helper)
    )
}

sims <- run_simulation(model_distinct)
sims

## -----------------------------------------------------------------------------
summarize_simulations <- function(sims) {
  sims |>
    unnest_wider(c(loss)) |>
    select(contains("loss"), everything()) |>
    summarize(
      across(contains("loss"), mean),
      .by = n
    ) |>
    pivot_longer(
      contains("loss"),
      names_to = "loss_type",
      values_to = "loss"
    ) |>
    mutate(
      loss_type = recode(
        loss_type,
        "sin_theta_loss" = "Sin Theta Loss",
        "u_two_inf_loss" = "Two-to-infinity (un-scaled)",
        "x_two_inf_loss" = "Two-to-infinity (scaled)",
        "spectral_loss" = "Spectral",
        "sin_theta_loss1" = "Sin Theta Loss (column 1)",
        "sin_theta_loss2" = "Sin Theta Loss (column 2)",
        "sin_theta_loss3" = "Sin Theta Loss (column 3)",
        "sin_theta_loss4" = "Sin Theta Loss (column 4)",
        "sin_theta_loss5" = "Sin Theta Loss (column 5)",
        "spectral_loss_relative1" = "Rel. error (sigma 1)",
        "spectral_loss_relative2" = "Rel. error (sigma 2)",
        "spectral_loss_relative3" = "Rel. error (sigma 3)",
        "spectral_loss_relative4" = "Rel. error (sigma 4)",
        "spectral_loss_relative5" = "Rel. error (sigma 5)",
      )
    )
}

results <- summarize_simulations(sims)
results

## -----------------------------------------------------------------------------
plot_results <- function(results) {
  results |>
    ggplot() +
    aes(x = n, y = loss, color = loss_type) +
    geom_line() +
    scale_x_log10(labels = label_log(digits = 2)) +
    scale_y_log10(labels = label_log(digits = 2)) +
    scale_color_viridis_d(begin = 0.15, end = 0.85, guide = "none") +
    facet_wrap(vars(loss_type), scales = "free_y") +
    labs(
      title = "Estimation error in adjacency spectral embedding",
      y = "Estimation error (log scale)",
      x = "Number of nodes (log scale)"
    ) +
    theme_minimal()
}

plot_results(results)

## -----------------------------------------------------------------------------
model_repeated |>
  run_simulation() |>
  summarize_simulations() |>
  plot_results()

## -----------------------------------------------------------------------------
graph_laplacian <- function(A) {
  degrees <- rowSums(A)
  tau <- mean(degrees)
  DA <- rowScale(A, 1 / sqrt(degrees + tau))
  colScale(DA, 1 / sqrt(degrees + tau))
}

svds_laplacian <- function(pop) {
  # regularize by expected mean degree (scalar)
  tau <- expected_degree(pop)
  # vector!!
  degrees_pop <- expected_degrees(pop)

  # rescale X in the population model so that XSX' is the expected
  # graph Laplacian. we can't use this to sample networks anymore, but
  # we can use it to bootstrap a population SVD calculation
  pop$X <- rowScale(pop$X, 1 / sqrt(degrees_pop + tau))
  svds(pop)
}

run_laplacian_simulation <- function(model, num_reps = 30) {
  expand_grid(
    n = c(100, 180, 330, 600, 1100, 2000),
    reps = 1:num_reps
  ) |>
    mutate(
      pop = map(n, model),
      s_pop = map(pop, svds_laplacian),
      A = map(pop, sample_sparse),
      L = map(A, graph_laplacian),
      s_obs = map(L, irlba, 5), # rank five svd,
      loss = map2(s_pop, s_obs, loss_helper)
    )
}

## -----------------------------------------------------------------------------
model_distinct |>
  run_laplacian_simulation() |>
  summarize_simulations() |>
  plot_results()

## -----------------------------------------------------------------------------
sims |>
  mutate(diagnostics = map2(s_pop, s_obs, function(pop, obs) {
    tibble(
      rank = 1:length(pop$d),
      val_pop = pop$d,
      val_obs = obs$d
    )
  })) |>
  select(n, reps, diagnostics) |>
  unnest(diagnostics) |>
  pivot_longer(
    cols = c(val_pop, val_obs),
    names_to = "type",
    values_to = "value"
  ) |>
  mutate(type = recode(type, val_pop = "Population", val_obs = "Observed")) |>
  ggplot(aes(
    x = factor(rank),
    y = value,
    color = type,
    group = interaction(reps, type)
  )) +
  geom_point(position = position_jitter(width = 0.1),
             alpha = 0.5,
             size = 1.5) +
  geom_line(alpha = 0.3) +
  facet_wrap( ~ n, scales = "free_y", labeller = label_both) +
  scale_color_manual(values = c(
    "Population" = "#377eb8",
    "Observed" = "#e41a1c"
  )) +
  labs(
    title = "Scree Plots: Population vs Observed",
    subtitle = "Comparing singular values across multiple simulation replicates",
    x = "Index of singular value",
    y = "Singular Value",
    color = "Spectrum"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")