## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----------------------------------------------------------------------------- library(dplyr) library(ggplot2) library(fastRG) library(irlba) library(Matrix) library(purrr) library(scales) library(tidyr) set.seed(27) model_distinct <- function(n, k = 5) { B <- matrix(0.05, nrow = k, ncol = k) diag(B) <- seq(0.8, 0.4, length.out = k) latent <- dcsbm( theta = rexp(n) + 1, B = B, expected_degree = 0.1 * n ) } model_repeated <- function(n, k = 5) { latent <- dcsbm( theta = rep(1, n), B = diag(0.5, k), expected_degree = 0.1 * n ) } ## ----------------------------------------------------------------------------- # sample the latent parameters of the blockmodel latent <- model_distinct(1000) # compute the population singular value decomposition of the blockmodel s_pop <- svds(latent) # sample a network conditional on the latent factors A <- sample_sparse(latent) # singular value decomposition of the observed network s_obs <- irlba(A, 5) # difference between population and sample singular values s_pop$d - s_obs$d ## ----------------------------------------------------------------------------- sin_theta_distance <- function(u, v) { s <- svd(crossprod(u, v)) ncol(u) - sum(s$d^2) } find_procrustes_rotation <- function(X, Y) { s <- svd(crossprod(X, Y)) tcrossprod(s$v, s$u) # NOTE U, V swap versus next one } procrustes_align <- function(X, Y) { s <- svd(crossprod(X, Y)) rotation <- tcrossprod(s$v, s$u) Y %*% rotation } two_to_infinity_loss <- function(X, Y) { s <- svd(crossprod(X, Y)) rotation <- tcrossprod(s$v, s$u) Yrot <- Y %*% rotation diff <- X - Yrot max(sqrt(Matrix::rowSums(diff^2))) } loss_helper <- function(s_pop, s_obs) { u_pop <- s_pop$u u_obs <- s_obs$u d_pop <- s_pop$d d_obs <- s_obs$d x_pop <- u_pop %*% sqrt(diag(d_pop)) x_obs <- u_obs %*% sqrt(diag(d_obs)) spectral_loss_relative <- abs(d_pop - d_obs) / d_pop column_sin_theta_loss <- map_dbl( 1:ncol(u_pop), \(j) { sin_theta_distance(u_pop[, j, drop = FALSE], u_obs[, j, drop = FALSE]) } ) tibble( sin_theta_loss = sin_theta_distance(u_pop, u_obs), u_two_inf_loss = two_to_infinity_loss(u_pop, u_obs), x_two_inf_loss = two_to_infinity_loss(x_pop, x_obs), spectral_loss_relative1 = spectral_loss_relative[1], spectral_loss_relative2 = spectral_loss_relative[2], spectral_loss_relative3 = spectral_loss_relative[3], spectral_loss_relative4 = spectral_loss_relative[4], spectral_loss_relative5 = spectral_loss_relative[5], sin_theta_loss1 = column_sin_theta_loss[1], sin_theta_loss2 = column_sin_theta_loss[2], sin_theta_loss3 = column_sin_theta_loss[3], sin_theta_loss4 = column_sin_theta_loss[4], sin_theta_loss5 = column_sin_theta_loss[5] ) } ## ----------------------------------------------------------------------------- run_simulation <- function(model, num_reps = 30) { expand_grid( n = c(100, 180, 330, 600, 1100, 2000), reps = 1:num_reps ) |> mutate( pop = map(n, model), s_pop = map(pop, svds), A = map(pop, sample_sparse), s_obs = map(A, irlba, 5), # rank five svd loss = map2(s_pop, s_obs, loss_helper) ) } sims <- run_simulation(model_distinct) sims ## ----------------------------------------------------------------------------- summarize_simulations <- function(sims) { sims |> unnest_wider(c(loss)) |> select(contains("loss"), everything()) |> summarize( across(contains("loss"), mean), .by = n ) |> pivot_longer( contains("loss"), names_to = "loss_type", values_to = "loss" ) |> mutate( loss_type = recode( loss_type, "sin_theta_loss" = "Sin Theta Loss", "u_two_inf_loss" = "Two-to-infinity (un-scaled)", "x_two_inf_loss" = "Two-to-infinity (scaled)", "spectral_loss" = "Spectral", "sin_theta_loss1" = "Sin Theta Loss (column 1)", "sin_theta_loss2" = "Sin Theta Loss (column 2)", "sin_theta_loss3" = "Sin Theta Loss (column 3)", "sin_theta_loss4" = "Sin Theta Loss (column 4)", "sin_theta_loss5" = "Sin Theta Loss (column 5)", "spectral_loss_relative1" = "Rel. error (sigma 1)", "spectral_loss_relative2" = "Rel. error (sigma 2)", "spectral_loss_relative3" = "Rel. error (sigma 3)", "spectral_loss_relative4" = "Rel. error (sigma 4)", "spectral_loss_relative5" = "Rel. error (sigma 5)", ) ) } results <- summarize_simulations(sims) results ## ----------------------------------------------------------------------------- plot_results <- function(results) { results |> ggplot() + aes(x = n, y = loss, color = loss_type) + geom_line() + scale_x_log10(labels = label_log(digits = 2)) + scale_y_log10(labels = label_log(digits = 2)) + scale_color_viridis_d(begin = 0.15, end = 0.85, guide = "none") + facet_wrap(vars(loss_type), scales = "free_y") + labs( title = "Estimation error in adjacency spectral embedding", y = "Estimation error (log scale)", x = "Number of nodes (log scale)" ) + theme_minimal() } plot_results(results) ## ----------------------------------------------------------------------------- model_repeated |> run_simulation() |> summarize_simulations() |> plot_results() ## ----------------------------------------------------------------------------- graph_laplacian <- function(A) { degrees <- rowSums(A) tau <- mean(degrees) DA <- rowScale(A, 1 / sqrt(degrees + tau)) colScale(DA, 1 / sqrt(degrees + tau)) } svds_laplacian <- function(pop) { # regularize by expected mean degree (scalar) tau <- expected_degree(pop) # vector!! degrees_pop <- expected_degrees(pop) # rescale X in the population model so that XSX' is the expected # graph Laplacian. we can't use this to sample networks anymore, but # we can use it to bootstrap a population SVD calculation pop$X <- rowScale(pop$X, 1 / sqrt(degrees_pop + tau)) svds(pop) } run_laplacian_simulation <- function(model, num_reps = 30) { expand_grid( n = c(100, 180, 330, 600, 1100, 2000), reps = 1:num_reps ) |> mutate( pop = map(n, model), s_pop = map(pop, svds_laplacian), A = map(pop, sample_sparse), L = map(A, graph_laplacian), s_obs = map(L, irlba, 5), # rank five svd, loss = map2(s_pop, s_obs, loss_helper) ) } ## ----------------------------------------------------------------------------- model_distinct |> run_laplacian_simulation() |> summarize_simulations() |> plot_results() ## ----------------------------------------------------------------------------- sims |> mutate(diagnostics = map2(s_pop, s_obs, function(pop, obs) { tibble( rank = 1:length(pop$d), val_pop = pop$d, val_obs = obs$d ) })) |> select(n, reps, diagnostics) |> unnest(diagnostics) |> pivot_longer( cols = c(val_pop, val_obs), names_to = "type", values_to = "value" ) |> mutate(type = recode(type, val_pop = "Population", val_obs = "Observed")) |> ggplot(aes( x = factor(rank), y = value, color = type, group = interaction(reps, type) )) + geom_point(position = position_jitter(width = 0.1), alpha = 0.5, size = 1.5) + geom_line(alpha = 0.3) + facet_wrap( ~ n, scales = "free_y", labeller = label_both) + scale_color_manual(values = c( "Population" = "#377eb8", "Observed" = "#e41a1c" )) + labs( title = "Scree Plots: Population vs Observed", subtitle = "Comparing singular values across multiple simulation replicates", x = "Index of singular value", y = "Singular Value", color = "Spectrum" ) + theme_minimal() + theme(legend.position = "bottom")